/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned).  Since Nehalem (the original Core i7) there has been
 * no performance difference between the two.  However, movaps is one byte
 * shorter, so that is the one we use for now (likewise for the unaligned
 * variants).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
	.octa 0x00000000000000000000000000000000
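
/*
 * The ALL_F/zero adjacency enables a branch-free mask for a trailing
 * partial block: for a tail of r13 bytes (0 < r13 < 16), the tail handling
 * further below loads 16 bytes from ALL_F+16 - r13, which yields r13 bytes
 * of 0xff followed by (16 - r13) zero bytes.  Informal sketch:
 *
 *	mask = *(ALL_F + 16 - r13)	# r13 x 0xff, then (16 - r13) x 0x00
 */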

.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1 )
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 *
 */
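
/*
 * The Karatsuba identity used by the macro below: splitting A = A1:A0 and
 * B = B1:B0 into 64-bit halves, three carry-less multiplies suffice
 * (with + denoting XOR):
 *
 *	A*B = A1*B1*x^128 + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*x^64 + A0*B0
 *
 * The 256-bit product is then folded back to 128 bits modulo the
 * bit-reflected GHASH polynomial x^128 + x^127 + x^126 + x^121 + 1 by the
 * two shift-and-XOR reduction phases in the macro body.
 */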
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle term)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

# first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

# second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
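#
# Rough C sketch of the macro below (x86 is little-endian; the memcpy
# stands in for the unaligned 8-byte load).  No byte past dptr[dlen-1] is
# ever touched:
#
#	u64 lo = 0, hi = 0;
#	int i;
#	if (dlen >= 8) {
#		memcpy(&lo, dptr, 8);		/* mov (DPTR), %rax */
#		for (i = dlen - 1; i >= 8; i--)
#			hi = (hi << 8) | dptr[i];
#	} else {
#		for (i = dlen - 1; i >= 0; i--)
#			lo = (lo << 8) | dptr[i];
#	}
#	/* XMMDst = hi:lo, hi in the upper qword */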
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 */
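
/*
 * In C terms the block partitioning looks like this (sketch, variable
 * names hypothetical):
 *
 *	full_blocks        = plaintext_len / 16;
 *	num_initial_blocks = full_blocks % 4;	// 0..3, handled here
 *	// the remaining multiple-of-4 blocks are then processed four at a
 *	// time by GHASH_4_ENCRYPT_4_PARALLEL_*
 */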


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ     SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r11           # %r11 = aadLen
	pxor	   %xmm\i, %xmm\i
	pxor       \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i

/* read the last <16B of AAD */
_get_AAD_rest\num_initial_blocks\operation:
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0                 # INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index      # perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_dec\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa     \TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
# no need for precomputed values
/*
 *
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
 */
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm


/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 */


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ     SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r11           # %r11 = aadLen
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i

/* read the last <16B of AAD */
_get_AAD_rest\num_initial_blocks\operation:
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		0(%arg1),\TMP2
.irpc index, \i_seq
	paddd		\TMP1, \XMM0                 # INCR Y0
	MOVADQ		\XMM0, %xmm\index
	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
	pxor		\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_enc\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
	PSHUFB_XMM %xmm14, %xmm\index

# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
# no need for precomputed values
/*
 *
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
 */
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * %arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
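
/*
 * Each iteration performs one four-way Horner step of GHASH over the
 * previous four blocks X1..X4, using the precomputed key powers, with the
 * AES rounds for the next four counter blocks interleaved between the
 * multiplies (+ is XOR):
 *
 *	S' = (S + X1)*H^4 + X2*H^3 + X3*H^2 + X4*H
 *
 * The running hash S has already been folded into XMM1 (i.e. X1) by
 * INITIAL_BLOCKS_* or by the end of the previous iteration.
 */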
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa    SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

# Multiply XMM8 * HashKey
# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
	movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
	movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
	movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

# first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

# second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
 * %arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa    SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

# Multiply XMM8 * HashKey
# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax		        # 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
	movdqa    \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

# first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

# second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

# Multiply XMM1 * HashKey^4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

# Multiply XMM2 * HashKey^3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

# Multiply XMM3 * HashKey^2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed left shifting << 31
	pslld     $30, \TMP3                # packed left shifting << 30
	pslld     $25, \TMP4                # packed left shifting << 25
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

# second phase of the reduction
	movdqa    \XMMDst, \TMP2
# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed right shift >> 1
	psrld     $2, \TMP3                 # packed right shift >> 2
	psrld     $7, \TMP4                 # packed right shift >> 7
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm


/* Encryption of a single block
 * uses eax & r10
 */
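
/*
 * C-level sketch of the macro below; nr is the number of AESENC rounds
 * derived from keysize (9, 11 or 13), and aesenc/aesenclast stand for the
 * corresponding AES-NI instructions:
 *
 *	state ^= rk[0];				// key whitening
 *	for (i = 1; i <= nr; i++)
 *		state = aesenc(state, rk[i]);
 *	state = aesenclast(state, rk[nr + 1]);
 */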

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10	  # get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
/*****************************************************************************
 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
 *                   const u8 *in,      // Ciphertext input
 *                   u64 plaintext_len, // Length of data in bytes for decryption.
 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
 *                   const u8 *aad,     // Additional Authentication Data (AAD)
 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
 *                                      // given authentication tag and only return the plaintext if they match.
 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
 *                                      // (most likely), 12 or 8.
 *
 * Assumptions:
 *
 * keys:
 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
 *       set of 11 keys in the data structure void *aes_ctx
 *
 * iv:
 *       0                   1                   2                   3
 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                             Salt  (From the SA)               |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                     Initialization Vector                     |
 *       |         (This is the sequence number from IPSec header)       |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                              0x1                              |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *
 *
 * AAD:
 *       AAD padded to 128 bits with 0
 *       for example, assume AAD is a u32 vector
 *
 *       if AAD is 8 bytes:
 *       AAD[3] = {A0, A1};
 *       padded AAD in xmm register = {A1 A0 0 0}
 *
 *       0                   1                   2                   3
 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                               SPI (A1)                        |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                     32-bit Sequence Number (A0)               |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                              0x0                              |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                                       AAD Format with 32-bit Sequence Number
 *
 *       if AAD is 12 bytes:
 *       AAD[3] = {A0, A1, A2};
 *       padded AAD in xmm register = {A2 A1 A0 0}
 *
 *       0                   1                   2                   3
 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                               SPI (A2)                        |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                 64-bit Extended Sequence Number {A1,A0}       |
 *       |                                                               |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *       |                              0x0                              |
 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                        AAD Format with 64-bit Extended Sequence Number
 *
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 *
 *****************************************************************************/
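/*
 * In GCM terms, the routine below computes the tag as (sketch):
 *
 *	S = GHASH_H(AAD || C || [len(AAD)]_64 || [len(C)]_64)  # lengths in bits
 *	T = MSB_auth_tag_len(E(K, J0) XOR S)
 *
 * with H derived from hash_subkey and J0 the pre-counter block passed in iv.
 */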
|  | ENTRY(aesni_gcm_dec) | 
|  | push	%r12 | 
|  | push	%r13 | 
|  | push	%r14 | 
|  | mov	%rsp, %r14 | 
|  | /* | 
|  | * states of %xmm registers %xmm6:%xmm15 not saved | 
|  | * all %xmm registers are clobbered | 
|  | */ | 
|  | sub	$VARIABLE_OFFSET, %rsp | 
|  | and	$~63, %rsp                        # align rsp to 64 bytes | 
|  | mov	%arg6, %r12 | 
|  | movdqu	(%r12), %xmm13			  # %xmm13 = HashKey | 
|  | movdqa  SHUF_MASK(%rip), %xmm2 | 
|  | PSHUFB_XMM %xmm2, %xmm13 | 
|  |  | 
|  |  | 
|  | # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) | 
|  |  | 
|  | movdqa	%xmm13, %xmm2 | 
|  | psllq	$1, %xmm13 | 
|  | psrlq	$63, %xmm2 | 
|  | movdqa	%xmm2, %xmm1 | 
|  | pslldq	$8, %xmm2 | 
|  | psrldq	$8, %xmm1 | 
|  | por	%xmm2, %xmm13 | 
|  |  | 
|  | # Reduction | 
|  |  | 
|  | pshufd	$0x24, %xmm1, %xmm2 | 
|  | pcmpeqd TWOONE(%rip), %xmm2 | 
|  | pand	POLY(%rip), %xmm2 | 
|  | pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly) | 
|  |  | 
|  |  | 
|  | # Decrypt first few blocks | 
|  |  | 
|  | movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly) | 
|  | mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext | 
|  | and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16) | 
|  | mov %r13, %r12 | 
|  | and $(3<<4), %r12 | 
|  | jz _initial_num_blocks_is_0_decrypt | 
|  | cmp $(2<<4), %r12 | 
|  | jb _initial_num_blocks_is_1_decrypt | 
|  | je _initial_num_blocks_is_2_decrypt | 
|  | _initial_num_blocks_is_3_decrypt: | 
|  | INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec | 
|  | sub	$48, %r13 | 
|  | jmp	_initial_blocks_decrypted | 
|  | _initial_num_blocks_is_2_decrypt: | 
|  | INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec | 
|  | sub	$32, %r13 | 
|  | jmp	_initial_blocks_decrypted | 
|  | _initial_num_blocks_is_1_decrypt: | 
|  | INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec | 
|  | sub	$16, %r13 | 
|  | jmp	_initial_blocks_decrypted | 
|  | _initial_num_blocks_is_0_decrypt: | 
|  | INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec | 
|  | _initial_blocks_decrypted: | 
|  | cmp	$0, %r13 | 
|  | je	_zero_cipher_left_decrypt | 
|  | sub	$64, %r13 | 
|  | je	_four_cipher_left_decrypt | 
|  | _decrypt_by_4: | 
|  | GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | 
|  | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec | 
|  | add	$64, %r11 | 
|  | sub	$64, %r13 | 
|  | jne	_decrypt_by_4 | 
|  | _four_cipher_left_decrypt: | 
|  | GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | 
|  | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | 
|  | _zero_cipher_left_decrypt: | 
|  | mov	%arg4, %r13 | 
|  | and	$15, %r13				# %r13 = arg4 (mod 16) | 
|  | je	_multiple_of_16_bytes_decrypt | 
|  |  | 
|  | # Handle the last <16 byte block separately | 
|  |  | 
|  | paddd ONE(%rip), %xmm0         # increment CNT to get Yn | 
|  | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | PSHUFB_XMM %xmm10, %xmm0 | 
|  |  | 
|  | ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn) | 
|  |  | 
|  | lea (%arg3,%r11,1), %r10 | 
|  | mov %r13, %r12 | 
|  | READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 | 
|  |  | 
|  | lea ALL_F+16(%rip), %r12 | 
|  | sub %r13, %r12 | 
|  | movdqa  %xmm1, %xmm2 | 
|  | pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn) | 
|  | movdqu (%r12), %xmm1 | 
|  | # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 | 
|  | pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0 | 
|  | pand    %xmm1, %xmm2 | 
|  | movdqa SHUF_MASK(%rip), %xmm10 | 
PSHUFB_XMM %xmm10, %xmm2
|  |  | 
|  | pxor %xmm2, %xmm8 | 
|  | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  |  | 
|  | # output %r13 bytes | 
|  | MOVQ_R64_XMM	%xmm0, %rax | 
|  | cmp	$8, %r13 | 
|  | jle	_less_than_8_bytes_left_decrypt | 
|  | mov	%rax, (%arg2 , %r11, 1) | 
|  | add	$8, %r11 | 
|  | psrldq	$8, %xmm0 | 
|  | MOVQ_R64_XMM	%xmm0, %rax | 
|  | sub	$8, %r13 | 
|  | _less_than_8_bytes_left_decrypt: | 
|  | mov	%al,  (%arg2, %r11, 1) | 
|  | add	$1, %r11 | 
|  | shr	$8, %rax | 
|  | sub	$1, %r13 | 
|  | jne	_less_than_8_bytes_left_decrypt | 
|  | _multiple_of_16_bytes_decrypt: | 
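# Finish GHASH with the mandatory length block: 64-bit len(A) and 64-bit
# len(C), both in bits, i.e. S = GHASH(H, A || C || len(A) || len(C)).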
mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
|  | shl	$3, %r12		  # convert into number of bits | 
|  | movd	%r12d, %xmm15		  # len(A) in %xmm15 | 
shl	$3, %arg4		  # len(C) in bits (*8)
|  | MOVQ_R64_XMM	%arg4, %xmm1 | 
|  | pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000 | 
|  | pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C) | 
|  | pxor	%xmm15, %xmm8 | 
|  | GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  | # final GHASH computation | 
|  | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | PSHUFB_XMM %xmm10, %xmm8 | 
|  |  | 
|  | mov	%arg5, %rax		  # %rax = *Y0 | 
|  | movdqu	(%rax), %xmm0		  # %xmm0 = Y0 | 
|  | ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0) | 
|  | pxor	%xmm8, %xmm0 | 
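
# %xmm0 now holds T = GHASH(...) XOR E(K, Y0).  The stores below emit the
# tag in 8/4/2/1-byte pieces so the tag lengths permitted by the crypto
# layer (e.g. 8, 12 or 16 bytes) never write past auth_tag_len.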
|  | _return_T_decrypt: | 
|  | mov	arg9, %r10                # %r10 = authTag | 
|  | mov	arg10, %r11               # %r11 = auth_tag_len | 
|  | cmp	$16, %r11 | 
|  | je	_T_16_decrypt | 
|  | cmp	$8, %r11 | 
|  | jl	_T_4_decrypt | 
|  | _T_8_decrypt: | 
|  | MOVQ_R64_XMM	%xmm0, %rax | 
|  | mov	%rax, (%r10) | 
|  | add	$8, %r10 | 
|  | sub	$8, %r11 | 
|  | psrldq	$8, %xmm0 | 
|  | cmp	$0, %r11 | 
|  | je	_return_T_done_decrypt | 
|  | _T_4_decrypt: | 
|  | movd	%xmm0, %eax | 
|  | mov	%eax, (%r10) | 
|  | add	$4, %r10 | 
|  | sub	$4, %r11 | 
|  | psrldq	$4, %xmm0 | 
|  | cmp	$0, %r11 | 
|  | je	_return_T_done_decrypt | 
|  | _T_123_decrypt: | 
|  | movd	%xmm0, %eax | 
|  | cmp	$2, %r11 | 
|  | jl	_T_1_decrypt | 
|  | mov	%ax, (%r10) | 
|  | cmp	$2, %r11 | 
|  | je	_return_T_done_decrypt | 
|  | add	$2, %r10 | 
|  | sar	$16, %eax | 
|  | _T_1_decrypt: | 
|  | mov	%al, (%r10) | 
|  | jmp	_return_T_done_decrypt | 
|  | _T_16_decrypt: | 
|  | movdqu	%xmm0, (%r10) | 
|  | _return_T_done_decrypt: | 
|  | mov	%r14, %rsp | 
|  | pop	%r14 | 
|  | pop	%r13 | 
|  | pop	%r12 | 
|  | ret | 
|  | ENDPROC(aesni_gcm_dec) | 
|  |  | 
|  |  | 
|  | /***************************************************************************** | 
|  | * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary. | 
|  | *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed. | 
|  | *                    const u8 *in,       // Plaintext input | 
|  | *                    u64 plaintext_len,  // Length of data in bytes for encryption. | 
|  | *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association) | 
|  | *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | 
|  | *                                        // concatenated with 0x00000001. 16-byte aligned pointer. | 
|  | *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary. | 
|  | *                    const u8 *aad,      // Additional Authentication Data (AAD) | 
|  | *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | 
|  | *                    u8 *auth_tag,       // Authenticated Tag output. | 
|  | *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely), | 
|  | *                                        // 12 or 8. | 
|  | * | 
|  | * Assumptions: | 
|  | * | 
|  | * keys: | 
|  | *       keys are pre-expanded and aligned to 16 bytes. we are using the | 
|  | *       first set of 11 keys in the data structure void *aes_ctx | 
|  | * | 
|  | * | 
|  | * iv: | 
|  | *       0                   1                   2                   3 | 
|  | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                             Salt  (From the SA)               | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                     Initialization Vector                     | | 
|  | *       |         (This is the sequence number from IPSec header)       | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                              0x1                              | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | * | 
|  | * | 
|  | * | 
|  | * AAD: | 
|  | *       AAD padded to 128 bits with 0 | 
|  | *       for example, assume AAD is a u32 vector | 
|  | * | 
|  | *       if AAD is 8 bytes: | 
|  | *       AAD[3] = {A0, A1}; | 
|  | *       padded AAD in xmm register = {A1 A0 0 0} | 
|  | * | 
|  | *       0                   1                   2                   3 | 
|  | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                               SPI (A1)                        | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                     32-bit Sequence Number (A0)               | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                              0x0                              | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | * | 
|  | *                                 AAD Format with 32-bit Sequence Number | 
|  | * | 
|  | *       if AAD is 12 bytes: | 
|  | *       AAD[3] = {A0, A1, A2}; | 
|  | *       padded AAD in xmm register = {A2 A1 A0 0} | 
|  | * | 
|  | *       0                   1                   2                   3 | 
|  | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                               SPI (A2)                        | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                 64-bit Extended Sequence Number {A1,A0}       | | 
|  | *       |                                                               | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | *       |                              0x0                              | | 
|  | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | * | 
|  | *                         AAD Format with 64-bit Extended Sequence Number | 
|  | * | 
|  | * poly = x^128 + x^127 + x^126 + x^121 + 1 | 
|  | ***************************************************************************/ | 
|  | ENTRY(aesni_gcm_enc) | 
|  | push	%r12 | 
|  | push	%r13 | 
|  | push	%r14 | 
|  | mov	%rsp, %r14 | 
|  | # | 
|  | # states of %xmm registers %xmm6:%xmm15 not saved | 
|  | # all %xmm registers are clobbered | 
|  | # | 
|  | sub	$VARIABLE_OFFSET, %rsp | 
and	$~63, %rsp			# align rsp to 64 bytes
|  | mov	%arg6, %r12 | 
|  | movdqu	(%r12), %xmm13 | 
|  | movdqa  SHUF_MASK(%rip), %xmm2 | 
|  | PSHUFB_XMM %xmm2, %xmm13 | 
|  |  | 
|  |  | 
|  | # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) | 
|  |  | 
|  | movdqa	%xmm13, %xmm2 | 
|  | psllq	$1, %xmm13 | 
|  | psrlq	$63, %xmm2 | 
|  | movdqa	%xmm2, %xmm1 | 
|  | pslldq	$8, %xmm2 | 
|  | psrldq	$8, %xmm1 | 
|  | por	%xmm2, %xmm13 | 
|  |  | 
|  | # reduce HashKey<<1 | 
|  |  | 
|  | pshufd	$0x24, %xmm1, %xmm2 | 
|  | pcmpeqd TWOONE(%rip), %xmm2 | 
|  | pand	POLY(%rip), %xmm2 | 
pxor	%xmm2, %xmm13			# %xmm13 holds HashKey<<1 (mod poly)
movdqa	%xmm13, HashKey(%rsp)		# store HashKey<<1 (mod poly)
mov	%arg4, %r13			# save the number of bytes of plaintext/ciphertext
and	$-16, %r13			# %r13 = %r13 - (%r13 mod 16)
|  | mov	%r13, %r12 | 
|  |  | 
|  | # Encrypt first few blocks | 
|  |  | 
|  | and	$(3<<4), %r12 | 
|  | jz	_initial_num_blocks_is_0_encrypt | 
|  | cmp	$(2<<4), %r12 | 
|  | jb	_initial_num_blocks_is_1_encrypt | 
|  | je	_initial_num_blocks_is_2_encrypt | 
|  | _initial_num_blocks_is_3_encrypt: | 
|  | INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc | 
|  | sub	$48, %r13 | 
|  | jmp	_initial_blocks_encrypted | 
|  | _initial_num_blocks_is_2_encrypt: | 
|  | INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc | 
|  | sub	$32, %r13 | 
|  | jmp	_initial_blocks_encrypted | 
|  | _initial_num_blocks_is_1_encrypt: | 
|  | INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc | 
|  | sub	$16, %r13 | 
|  | jmp	_initial_blocks_encrypted | 
|  | _initial_num_blocks_is_0_encrypt: | 
|  | INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
|  | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc | 
|  | _initial_blocks_encrypted: | 
|  |  | 
|  | # Main loop - Encrypt remaining blocks | 
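# (Each GHASH_4_ENCRYPT_4_PARALLEL_ENC iteration interleaves the AES
# rounds of four fresh counter blocks with the PCLMULQDQ-based GHASH of
# the four previously produced ciphertext blocks, hiding the latency of
# both instruction families; %r11 tracks the data offset.)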
|  |  | 
|  | cmp	$0, %r13 | 
|  | je	_zero_cipher_left_encrypt | 
|  | sub	$64, %r13 | 
|  | je	_four_cipher_left_encrypt | 
|  | _encrypt_by_4_encrypt: | 
|  | GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | 
|  | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc | 
|  | add	$64, %r11 | 
|  | sub	$64, %r13 | 
|  | jne	_encrypt_by_4_encrypt | 
|  | _four_cipher_left_encrypt: | 
|  | GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | 
|  | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | 
|  | _zero_cipher_left_encrypt: | 
|  | mov	%arg4, %r13 | 
|  | and	$15, %r13			# %r13 = arg4 (mod 16) | 
|  | je	_multiple_of_16_bytes_encrypt | 
|  |  | 
|  | # Handle the last <16 Byte block separately | 
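# Same scheme as the decrypt path, except GHASH must absorb the freshly
# produced ciphertext: %xmm0 is masked and byte-swapped after the XOR,
# whereas decryption hashed a copy of the incoming ciphertext.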
|  | paddd ONE(%rip), %xmm0                # INCR CNT to get Yn | 
|  | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | PSHUFB_XMM %xmm10, %xmm0 | 
|  |  | 
|  | ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn) | 
|  |  | 
|  | lea (%arg3,%r11,1), %r10 | 
|  | mov %r13, %r12 | 
|  | READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 | 
|  |  | 
|  | lea ALL_F+16(%rip), %r12 | 
|  | sub %r13, %r12 | 
|  | pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn) | 
|  | movdqu	(%r12), %xmm1 | 
# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
pand	%xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
|  |  | 
|  | pxor	%xmm0, %xmm8 | 
|  | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  | # GHASH computation for the last <16 byte block | 
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0	# shuffle %xmm0 back to output as ciphertext

# Output %r13 bytes
|  | MOVQ_R64_XMM %xmm0, %rax | 
|  | cmp $8, %r13 | 
|  | jle _less_than_8_bytes_left_encrypt | 
|  | mov %rax, (%arg2 , %r11, 1) | 
|  | add $8, %r11 | 
|  | psrldq $8, %xmm0 | 
|  | MOVQ_R64_XMM %xmm0, %rax | 
|  | sub $8, %r13 | 
|  | _less_than_8_bytes_left_encrypt: | 
|  | mov %al,  (%arg2, %r11, 1) | 
|  | add $1, %r11 | 
|  | shr $8, %rax | 
|  | sub $1, %r13 | 
|  | jne _less_than_8_bytes_left_encrypt | 
|  | _multiple_of_16_bytes_encrypt: | 
mov	arg8, %r12    # %r12 = aadLen (number of bytes)
shl	$3, %r12      # convert into number of bits
|  | movd	%r12d, %xmm15       # len(A) in %xmm15 | 
shl	$3, %arg4               # len(C) in bits (*8)
|  | MOVQ_R64_XMM	%arg4, %xmm1 | 
|  | pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000 | 
|  | pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C) | 
|  | pxor	%xmm15, %xmm8 | 
|  | GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  | # final GHASH computation | 
|  | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap | 
|  |  | 
|  | mov	%arg5, %rax		       # %rax  = *Y0 | 
|  | movdqu	(%rax), %xmm0		       # %xmm0 = Y0 | 
|  | ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0) | 
|  | pxor	%xmm8, %xmm0 | 
|  | _return_T_encrypt: | 
|  | mov	arg9, %r10                     # %r10 = authTag | 
|  | mov	arg10, %r11                    # %r11 = auth_tag_len | 
|  | cmp	$16, %r11 | 
|  | je	_T_16_encrypt | 
|  | cmp	$8, %r11 | 
|  | jl	_T_4_encrypt | 
|  | _T_8_encrypt: | 
|  | MOVQ_R64_XMM	%xmm0, %rax | 
|  | mov	%rax, (%r10) | 
|  | add	$8, %r10 | 
|  | sub	$8, %r11 | 
|  | psrldq	$8, %xmm0 | 
|  | cmp	$0, %r11 | 
|  | je	_return_T_done_encrypt | 
|  | _T_4_encrypt: | 
|  | movd	%xmm0, %eax | 
|  | mov	%eax, (%r10) | 
|  | add	$4, %r10 | 
|  | sub	$4, %r11 | 
|  | psrldq	$4, %xmm0 | 
|  | cmp	$0, %r11 | 
|  | je	_return_T_done_encrypt | 
|  | _T_123_encrypt: | 
|  | movd	%xmm0, %eax | 
|  | cmp	$2, %r11 | 
|  | jl	_T_1_encrypt | 
|  | mov	%ax, (%r10) | 
|  | cmp	$2, %r11 | 
|  | je	_return_T_done_encrypt | 
|  | add	$2, %r10 | 
|  | sar	$16, %eax | 
|  | _T_1_encrypt: | 
|  | mov	%al, (%r10) | 
|  | jmp	_return_T_done_encrypt | 
|  | _T_16_encrypt: | 
|  | movdqu	%xmm0, (%r10) | 
|  | _return_T_done_encrypt: | 
|  | mov	%r14, %rsp | 
|  | pop	%r14 | 
|  | pop	%r13 | 
|  | pop	%r12 | 
|  | ret | 
|  | ENDPROC(aesni_gcm_enc) | 
|  |  | 
|  | #endif | 
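
/*
 * The _key_expansion_* helpers below implement the AES key schedule on
 * top of AESKEYGENASSIST.  Hedged C sketch of one AES-128 step
 * (illustrative only), with w[] the round-key words and i the round:
 *
 *	t = SubWord(RotWord(w[4*i - 1])) ^ rcon[i];
 *	w[4*i + 0] = w[4*i - 4] ^ t;
 *	w[4*i + 1] = w[4*i - 3] ^ w[4*i + 0];
 *	w[4*i + 2] = w[4*i - 2] ^ w[4*i + 1];
 *	w[4*i + 3] = w[4*i - 1] ^ w[4*i + 2];
 *
 * AESKEYGENASSIST supplies the SubWord/RotWord/rcon term; the shufps/pxor
 * ladder reproduces the chained XORs entirely inside XMM registers.
 */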
|  |  | 
|  |  | 
|  | .align 4 | 
|  | _key_expansion_128: | 
|  | _key_expansion_256a: | 
|  | pshufd $0b11111111, %xmm1, %xmm1 | 
|  | shufps $0b00010000, %xmm0, %xmm4 | 
|  | pxor %xmm4, %xmm0 | 
|  | shufps $0b10001100, %xmm0, %xmm4 | 
|  | pxor %xmm4, %xmm0 | 
|  | pxor %xmm1, %xmm0 | 
|  | movaps %xmm0, (TKEYP) | 
|  | add $0x10, TKEYP | 
|  | ret | 
|  | ENDPROC(_key_expansion_128) | 
|  | ENDPROC(_key_expansion_256a) | 
|  |  | 
|  | .align 4 | 
|  | _key_expansion_192a: | 
|  | pshufd $0b01010101, %xmm1, %xmm1 | 
|  | shufps $0b00010000, %xmm0, %xmm4 | 
|  | pxor %xmm4, %xmm0 | 
|  | shufps $0b10001100, %xmm0, %xmm4 | 
|  | pxor %xmm4, %xmm0 | 
|  | pxor %xmm1, %xmm0 | 
|  |  | 
|  | movaps %xmm2, %xmm5 | 
|  | movaps %xmm2, %xmm6 | 
|  | pslldq $4, %xmm5 | 
|  | pshufd $0b11111111, %xmm0, %xmm3 | 
|  | pxor %xmm3, %xmm2 | 
|  | pxor %xmm5, %xmm2 | 
|  |  | 
|  | movaps %xmm0, %xmm1 | 
|  | shufps $0b01000100, %xmm0, %xmm6 | 
|  | movaps %xmm6, (TKEYP) | 
|  | shufps $0b01001110, %xmm2, %xmm1 | 
|  | movaps %xmm1, 0x10(TKEYP) | 
|  | add $0x20, TKEYP | 
|  | ret | 
|  | ENDPROC(_key_expansion_192a) | 
|  |  | 
|  | .align 4 | 
|  | _key_expansion_192b: | 
|  | pshufd $0b01010101, %xmm1, %xmm1 | 
|  | shufps $0b00010000, %xmm0, %xmm4 | 
|  | pxor %xmm4, %xmm0 | 
|  | shufps $0b10001100, %xmm0, %xmm4 | 
|  | pxor %xmm4, %xmm0 | 
|  | pxor %xmm1, %xmm0 | 
|  |  | 
|  | movaps %xmm2, %xmm5 | 
|  | pslldq $4, %xmm5 | 
|  | pshufd $0b11111111, %xmm0, %xmm3 | 
|  | pxor %xmm3, %xmm2 | 
|  | pxor %xmm5, %xmm2 | 
|  |  | 
|  | movaps %xmm0, (TKEYP) | 
|  | add $0x10, TKEYP | 
|  | ret | 
|  | ENDPROC(_key_expansion_192b) | 
|  |  | 
|  | .align 4 | 
|  | _key_expansion_256b: | 
|  | pshufd $0b10101010, %xmm1, %xmm1 | 
|  | shufps $0b00010000, %xmm2, %xmm4 | 
|  | pxor %xmm4, %xmm2 | 
|  | shufps $0b10001100, %xmm2, %xmm4 | 
|  | pxor %xmm4, %xmm2 | 
|  | pxor %xmm1, %xmm2 | 
|  | movaps %xmm2, (TKEYP) | 
|  | add $0x10, TKEYP | 
|  | ret | 
|  | ENDPROC(_key_expansion_256b) | 
|  |  | 
|  | /* | 
|  | * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, | 
|  | *                   unsigned int key_len) | 
|  | */ | 
|  | ENTRY(aesni_set_key) | 
|  | FRAME_BEGIN | 
|  | #ifndef __x86_64__ | 
|  | pushl KEYP | 
|  | movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx | 
|  | movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key | 
|  | movl (FRAME_OFFSET+16)(%esp), %edx	# key_len | 
|  | #endif | 
|  | movups (UKEYP), %xmm0		# user key (first 16 bytes) | 
|  | movaps %xmm0, (KEYP) | 
|  | lea 0x10(KEYP), TKEYP		# key addr | 
|  | movl %edx, 480(KEYP) | 
|  | pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x | 
|  | cmp $24, %dl | 
|  | jb .Lenc_key128 | 
|  | je .Lenc_key192 | 
|  | movups 0x10(UKEYP), %xmm2	# other user key | 
|  | movaps %xmm2, (TKEYP) | 
|  | add $0x10, TKEYP | 
|  | AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1 | 
|  | call _key_expansion_256a | 
|  | AESKEYGENASSIST 0x1 %xmm0 %xmm1 | 
|  | call _key_expansion_256b | 
|  | AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2 | 
|  | call _key_expansion_256a | 
|  | AESKEYGENASSIST 0x2 %xmm0 %xmm1 | 
|  | call _key_expansion_256b | 
|  | AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3 | 
|  | call _key_expansion_256a | 
|  | AESKEYGENASSIST 0x4 %xmm0 %xmm1 | 
|  | call _key_expansion_256b | 
|  | AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4 | 
|  | call _key_expansion_256a | 
|  | AESKEYGENASSIST 0x8 %xmm0 %xmm1 | 
|  | call _key_expansion_256b | 
|  | AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5 | 
|  | call _key_expansion_256a | 
|  | AESKEYGENASSIST 0x10 %xmm0 %xmm1 | 
|  | call _key_expansion_256b | 
|  | AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6 | 
|  | call _key_expansion_256a | 
|  | AESKEYGENASSIST 0x20 %xmm0 %xmm1 | 
|  | call _key_expansion_256b | 
|  | AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7 | 
|  | call _key_expansion_256a | 
|  | jmp .Ldec_key | 
|  | .Lenc_key192: | 
|  | movq 0x10(UKEYP), %xmm2		# other user key | 
|  | AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1 | 
|  | call _key_expansion_192a | 
|  | AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2 | 
|  | call _key_expansion_192b | 
|  | AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3 | 
|  | call _key_expansion_192a | 
|  | AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4 | 
|  | call _key_expansion_192b | 
|  | AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5 | 
|  | call _key_expansion_192a | 
|  | AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6 | 
|  | call _key_expansion_192b | 
|  | AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7 | 
|  | call _key_expansion_192a | 
|  | AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8 | 
|  | call _key_expansion_192b | 
|  | jmp .Ldec_key | 
|  | .Lenc_key128: | 
|  | AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9 | 
|  | call _key_expansion_128 | 
|  | AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10 | 
|  | call _key_expansion_128 | 
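
# Build the decryption schedule for the Equivalent Inverse Cipher: swap
# the first and last round keys verbatim, and run every middle round key
# through AESIMC (InvMixColumns) while reversing their order.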
|  | .Ldec_key: | 
|  | sub $0x10, TKEYP | 
|  | movaps (KEYP), %xmm0 | 
|  | movaps (TKEYP), %xmm1 | 
|  | movaps %xmm0, 240(TKEYP) | 
|  | movaps %xmm1, 240(KEYP) | 
|  | add $0x10, KEYP | 
|  | lea 240-16(TKEYP), UKEYP | 
|  | .align 4 | 
|  | .Ldec_key_loop: | 
|  | movaps (KEYP), %xmm0 | 
|  | AESIMC %xmm0 %xmm1 | 
|  | movaps %xmm1, (UKEYP) | 
|  | add $0x10, KEYP | 
|  | sub $0x10, UKEYP | 
|  | cmp TKEYP, KEYP | 
|  | jb .Ldec_key_loop | 
|  | xor AREG, AREG | 
|  | #ifndef __x86_64__ | 
|  | popl KEYP | 
|  | #endif | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_set_key) | 
|  |  | 
|  | /* | 
|  | * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) | 
|  | */ | 
|  | ENTRY(aesni_enc) | 
|  | FRAME_BEGIN | 
|  | #ifndef __x86_64__ | 
|  | pushl KEYP | 
|  | pushl KLEN | 
|  | movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx | 
|  | movl (FRAME_OFFSET+16)(%esp), OUTP	# dst | 
|  | movl (FRAME_OFFSET+20)(%esp), INP	# src | 
|  | #endif | 
|  | movl 480(KEYP), KLEN		# key length | 
|  | movups (INP), STATE		# input | 
|  | call _aesni_enc1 | 
|  | movups STATE, (OUTP)		# output | 
|  | #ifndef __x86_64__ | 
|  | popl KLEN | 
|  | popl KEYP | 
|  | #endif | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_enc) | 
|  |  | 
|  | /* | 
|  | * _aesni_enc1:		internal ABI | 
|  | * input: | 
|  | *	KEYP:		key struct pointer | 
|  | *	KLEN:		round count | 
|  | *	STATE:		initial state (input) | 
|  | * output: | 
|  | *	STATE:		finial state (output) | 
|  | * changed: | 
|  | *	KEY | 
|  | *	TKEYP (T1) | 
|  | */ | 
|  | .align 4 | 
|  | _aesni_enc1: | 
|  | movaps (KEYP), KEY		# key | 
|  | mov KEYP, TKEYP | 
|  | pxor KEY, STATE		# round 0 | 
|  | add $0x30, TKEYP | 
|  | cmp $24, KLEN | 
|  | jb .Lenc128 | 
|  | lea 0x20(TKEYP), TKEYP | 
|  | je .Lenc192 | 
|  | add $0x20, TKEYP | 
|  | movaps -0x60(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps -0x50(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | .align 4 | 
|  | .Lenc192: | 
|  | movaps -0x40(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps -0x30(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | .align 4 | 
|  | .Lenc128: | 
|  | movaps -0x20(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps -0x10(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps (TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps 0x10(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps 0x20(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps 0x30(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps 0x40(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps 0x50(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps 0x60(TKEYP), KEY | 
|  | AESENC KEY STATE | 
|  | movaps 0x70(TKEYP), KEY | 
|  | AESENCLAST KEY STATE | 
|  | ret | 
|  | ENDPROC(_aesni_enc1) | 
|  |  | 
|  | /* | 
|  | * _aesni_enc4:	internal ABI | 
|  | * input: | 
|  | *	KEYP:		key struct pointer | 
|  | *	KLEN:		round count | 
|  | *	STATE1:		initial state (input) | 
|  | *	STATE2 | 
|  | *	STATE3 | 
|  | *	STATE4 | 
|  | * output: | 
|  | *	STATE1:		finial state (output) | 
|  | *	STATE2 | 
|  | *	STATE3 | 
|  | *	STATE4 | 
|  | * changed: | 
|  | *	KEY | 
|  | *	TKEYP (T1) | 
|  | */ | 
|  | .align 4 | 
|  | _aesni_enc4: | 
|  | movaps (KEYP), KEY		# key | 
|  | mov KEYP, TKEYP | 
|  | pxor KEY, STATE1		# round 0 | 
|  | pxor KEY, STATE2 | 
|  | pxor KEY, STATE3 | 
|  | pxor KEY, STATE4 | 
|  | add $0x30, TKEYP | 
|  | cmp $24, KLEN | 
|  | jb .L4enc128 | 
|  | lea 0x20(TKEYP), TKEYP | 
|  | je .L4enc192 | 
|  | add $0x20, TKEYP | 
|  | movaps -0x60(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps -0x50(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | .L4enc192: | 
|  | movaps -0x40(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps -0x30(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | .L4enc128: | 
|  | movaps -0x20(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps -0x10(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps (TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps 0x10(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps 0x20(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps 0x30(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps 0x40(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps 0x50(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps 0x60(TKEYP), KEY | 
|  | AESENC KEY STATE1 | 
|  | AESENC KEY STATE2 | 
|  | AESENC KEY STATE3 | 
|  | AESENC KEY STATE4 | 
|  | movaps 0x70(TKEYP), KEY | 
|  | AESENCLAST KEY STATE1		# last round | 
|  | AESENCLAST KEY STATE2 | 
|  | AESENCLAST KEY STATE3 | 
|  | AESENCLAST KEY STATE4 | 
|  | ret | 
|  | ENDPROC(_aesni_enc4) | 
|  |  | 
|  | /* | 
|  | * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) | 
|  | */ | 
|  | ENTRY(aesni_dec) | 
|  | FRAME_BEGIN | 
|  | #ifndef __x86_64__ | 
|  | pushl KEYP | 
|  | pushl KLEN | 
|  | movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx | 
|  | movl (FRAME_OFFSET+16)(%esp), OUTP	# dst | 
|  | movl (FRAME_OFFSET+20)(%esp), INP	# src | 
|  | #endif | 
|  | mov 480(KEYP), KLEN		# key length | 
|  | add $240, KEYP | 
|  | movups (INP), STATE		# input | 
|  | call _aesni_dec1 | 
movups STATE, (OUTP)		# output
|  | #ifndef __x86_64__ | 
|  | popl KLEN | 
|  | popl KEYP | 
|  | #endif | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_dec) | 
|  |  | 
|  | /* | 
|  | * _aesni_dec1:		internal ABI | 
|  | * input: | 
|  | *	KEYP:		key struct pointer | 
|  | *	KLEN:		key length | 
|  | *	STATE:		initial state (input) | 
|  | * output: | 
|  | *	STATE:		finial state (output) | 
|  | * changed: | 
|  | *	KEY | 
|  | *	TKEYP (T1) | 
|  | */ | 
|  | .align 4 | 
|  | _aesni_dec1: | 
|  | movaps (KEYP), KEY		# key | 
|  | mov KEYP, TKEYP | 
|  | pxor KEY, STATE		# round 0 | 
|  | add $0x30, TKEYP | 
|  | cmp $24, KLEN | 
|  | jb .Ldec128 | 
|  | lea 0x20(TKEYP), TKEYP | 
|  | je .Ldec192 | 
|  | add $0x20, TKEYP | 
|  | movaps -0x60(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps -0x50(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | .align 4 | 
|  | .Ldec192: | 
|  | movaps -0x40(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps -0x30(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | .align 4 | 
|  | .Ldec128: | 
|  | movaps -0x20(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps -0x10(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps (TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps 0x10(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps 0x20(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps 0x30(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps 0x40(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps 0x50(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps 0x60(TKEYP), KEY | 
|  | AESDEC KEY STATE | 
|  | movaps 0x70(TKEYP), KEY | 
|  | AESDECLAST KEY STATE | 
|  | ret | 
|  | ENDPROC(_aesni_dec1) | 
|  |  | 
|  | /* | 
|  | * _aesni_dec4:	internal ABI | 
|  | * input: | 
|  | *	KEYP:		key struct pointer | 
|  | *	KLEN:		key length | 
|  | *	STATE1:		initial state (input) | 
|  | *	STATE2 | 
|  | *	STATE3 | 
|  | *	STATE4 | 
|  | * output: | 
|  | *	STATE1:		finial state (output) | 
|  | *	STATE2 | 
|  | *	STATE3 | 
|  | *	STATE4 | 
|  | * changed: | 
|  | *	KEY | 
|  | *	TKEYP (T1) | 
|  | */ | 
|  | .align 4 | 
|  | _aesni_dec4: | 
|  | movaps (KEYP), KEY		# key | 
|  | mov KEYP, TKEYP | 
|  | pxor KEY, STATE1		# round 0 | 
|  | pxor KEY, STATE2 | 
|  | pxor KEY, STATE3 | 
|  | pxor KEY, STATE4 | 
|  | add $0x30, TKEYP | 
|  | cmp $24, KLEN | 
|  | jb .L4dec128 | 
|  | lea 0x20(TKEYP), TKEYP | 
|  | je .L4dec192 | 
|  | add $0x20, TKEYP | 
|  | movaps -0x60(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps -0x50(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | .align 4 | 
|  | .L4dec192: | 
|  | movaps -0x40(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps -0x30(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | .align 4 | 
|  | .L4dec128: | 
|  | movaps -0x20(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps -0x10(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps (TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps 0x10(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps 0x20(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps 0x30(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps 0x40(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps 0x50(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps 0x60(TKEYP), KEY | 
|  | AESDEC KEY STATE1 | 
|  | AESDEC KEY STATE2 | 
|  | AESDEC KEY STATE3 | 
|  | AESDEC KEY STATE4 | 
|  | movaps 0x70(TKEYP), KEY | 
|  | AESDECLAST KEY STATE1		# last round | 
|  | AESDECLAST KEY STATE2 | 
|  | AESDECLAST KEY STATE3 | 
|  | AESDECLAST KEY STATE4 | 
|  | ret | 
|  | ENDPROC(_aesni_dec4) | 
|  |  | 
|  | /* | 
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
*		      size_t len);
|  | */ | 
|  | ENTRY(aesni_ecb_enc) | 
|  | FRAME_BEGIN | 
|  | #ifndef __x86_64__ | 
|  | pushl LEN | 
|  | pushl KEYP | 
|  | pushl KLEN | 
|  | movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx | 
|  | movl (FRAME_OFFSET+20)(%esp), OUTP	# dst | 
|  | movl (FRAME_OFFSET+24)(%esp), INP	# src | 
|  | movl (FRAME_OFFSET+28)(%esp), LEN	# len | 
|  | #endif | 
|  | test LEN, LEN		# check length | 
|  | jz .Lecb_enc_ret | 
|  | mov 480(KEYP), KLEN | 
|  | cmp $16, LEN | 
|  | jb .Lecb_enc_ret | 
|  | cmp $64, LEN | 
|  | jb .Lecb_enc_loop1 | 
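# Bulk path: four blocks per _aesni_enc4 call, then a one-block tail loop
# mops up any remaining whole 16-byte blocks.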
|  | .align 4 | 
|  | .Lecb_enc_loop4: | 
|  | movups (INP), STATE1 | 
|  | movups 0x10(INP), STATE2 | 
|  | movups 0x20(INP), STATE3 | 
|  | movups 0x30(INP), STATE4 | 
|  | call _aesni_enc4 | 
|  | movups STATE1, (OUTP) | 
|  | movups STATE2, 0x10(OUTP) | 
|  | movups STATE3, 0x20(OUTP) | 
|  | movups STATE4, 0x30(OUTP) | 
|  | sub $64, LEN | 
|  | add $64, INP | 
|  | add $64, OUTP | 
|  | cmp $64, LEN | 
|  | jge .Lecb_enc_loop4 | 
|  | cmp $16, LEN | 
|  | jb .Lecb_enc_ret | 
|  | .align 4 | 
|  | .Lecb_enc_loop1: | 
|  | movups (INP), STATE1 | 
|  | call _aesni_enc1 | 
|  | movups STATE1, (OUTP) | 
|  | sub $16, LEN | 
|  | add $16, INP | 
|  | add $16, OUTP | 
|  | cmp $16, LEN | 
|  | jge .Lecb_enc_loop1 | 
|  | .Lecb_enc_ret: | 
|  | #ifndef __x86_64__ | 
|  | popl KLEN | 
|  | popl KEYP | 
|  | popl LEN | 
|  | #endif | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_ecb_enc) | 
|  |  | 
|  | /* | 
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
*		      size_t len);
|  | */ | 
|  | ENTRY(aesni_ecb_dec) | 
|  | FRAME_BEGIN | 
|  | #ifndef __x86_64__ | 
|  | pushl LEN | 
|  | pushl KEYP | 
|  | pushl KLEN | 
|  | movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx | 
|  | movl (FRAME_OFFSET+20)(%esp), OUTP	# dst | 
|  | movl (FRAME_OFFSET+24)(%esp), INP	# src | 
|  | movl (FRAME_OFFSET+28)(%esp), LEN	# len | 
|  | #endif | 
|  | test LEN, LEN | 
|  | jz .Lecb_dec_ret | 
|  | mov 480(KEYP), KLEN | 
|  | add $240, KEYP | 
|  | cmp $16, LEN | 
|  | jb .Lecb_dec_ret | 
|  | cmp $64, LEN | 
|  | jb .Lecb_dec_loop1 | 
|  | .align 4 | 
|  | .Lecb_dec_loop4: | 
|  | movups (INP), STATE1 | 
|  | movups 0x10(INP), STATE2 | 
|  | movups 0x20(INP), STATE3 | 
|  | movups 0x30(INP), STATE4 | 
|  | call _aesni_dec4 | 
|  | movups STATE1, (OUTP) | 
|  | movups STATE2, 0x10(OUTP) | 
|  | movups STATE3, 0x20(OUTP) | 
|  | movups STATE4, 0x30(OUTP) | 
|  | sub $64, LEN | 
|  | add $64, INP | 
|  | add $64, OUTP | 
|  | cmp $64, LEN | 
|  | jge .Lecb_dec_loop4 | 
|  | cmp $16, LEN | 
|  | jb .Lecb_dec_ret | 
|  | .align 4 | 
|  | .Lecb_dec_loop1: | 
|  | movups (INP), STATE1 | 
|  | call _aesni_dec1 | 
|  | movups STATE1, (OUTP) | 
|  | sub $16, LEN | 
|  | add $16, INP | 
|  | add $16, OUTP | 
|  | cmp $16, LEN | 
|  | jge .Lecb_dec_loop1 | 
|  | .Lecb_dec_ret: | 
|  | #ifndef __x86_64__ | 
|  | popl KLEN | 
|  | popl KEYP | 
|  | popl LEN | 
|  | #endif | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_ecb_dec) | 
|  |  | 
|  | /* | 
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
*		      size_t len, u8 *iv);
|  | */ | 
|  | ENTRY(aesni_cbc_enc) | 
|  | FRAME_BEGIN | 
|  | #ifndef __x86_64__ | 
|  | pushl IVP | 
|  | pushl LEN | 
|  | pushl KEYP | 
|  | pushl KLEN | 
|  | movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx | 
|  | movl (FRAME_OFFSET+24)(%esp), OUTP	# dst | 
|  | movl (FRAME_OFFSET+28)(%esp), INP	# src | 
|  | movl (FRAME_OFFSET+32)(%esp), LEN	# len | 
|  | movl (FRAME_OFFSET+36)(%esp), IVP	# iv | 
|  | #endif | 
|  | cmp $16, LEN | 
|  | jb .Lcbc_enc_ret | 
|  | mov 480(KEYP), KLEN | 
|  | movups (IVP), STATE	# load iv as initial state | 
|  | .align 4 | 
|  | .Lcbc_enc_loop: | 
|  | movups (INP), IN	# load input | 
|  | pxor IN, STATE | 
|  | call _aesni_enc1 | 
|  | movups STATE, (OUTP)	# store output | 
|  | sub $16, LEN | 
|  | add $16, INP | 
|  | add $16, OUTP | 
|  | cmp $16, LEN | 
|  | jge .Lcbc_enc_loop | 
|  | movups STATE, (IVP) | 
|  | .Lcbc_enc_ret: | 
|  | #ifndef __x86_64__ | 
|  | popl KLEN | 
|  | popl KEYP | 
|  | popl LEN | 
|  | popl IVP | 
|  | #endif | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_cbc_enc) | 
|  |  | 
|  | /* | 
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
*		      size_t len, u8 *iv);
|  | */ | 
|  | ENTRY(aesni_cbc_dec) | 
|  | FRAME_BEGIN | 
|  | #ifndef __x86_64__ | 
|  | pushl IVP | 
|  | pushl LEN | 
|  | pushl KEYP | 
|  | pushl KLEN | 
|  | movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx | 
|  | movl (FRAME_OFFSET+24)(%esp), OUTP	# dst | 
|  | movl (FRAME_OFFSET+28)(%esp), INP	# src | 
|  | movl (FRAME_OFFSET+32)(%esp), LEN	# len | 
|  | movl (FRAME_OFFSET+36)(%esp), IVP	# iv | 
|  | #endif | 
|  | cmp $16, LEN | 
|  | jb .Lcbc_dec_just_ret | 
|  | mov 480(KEYP), KLEN | 
|  | add $240, KEYP | 
|  | movups (IVP), IV | 
|  | cmp $64, LEN | 
|  | jb .Lcbc_dec_loop1 | 
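# CBC decryption, four blocks per iteration.  Conceptually (hedged C
# sketch, illustrative only):
#
#	for each block i: P[i] = AES_dec(K, C[i]) ^ C[i-1];  /* C[-1] = IV */
#
# On 32-bit only eight XMM registers exist, so the ciphertext blocks are
# re-read from INP after _aesni_dec4 instead of being kept in registers.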
|  | .align 4 | 
|  | .Lcbc_dec_loop4: | 
|  | movups (INP), IN1 | 
|  | movaps IN1, STATE1 | 
|  | movups 0x10(INP), IN2 | 
|  | movaps IN2, STATE2 | 
|  | #ifdef __x86_64__ | 
|  | movups 0x20(INP), IN3 | 
|  | movaps IN3, STATE3 | 
|  | movups 0x30(INP), IN4 | 
|  | movaps IN4, STATE4 | 
|  | #else | 
|  | movups 0x20(INP), IN1 | 
|  | movaps IN1, STATE3 | 
|  | movups 0x30(INP), IN2 | 
|  | movaps IN2, STATE4 | 
|  | #endif | 
|  | call _aesni_dec4 | 
|  | pxor IV, STATE1 | 
|  | #ifdef __x86_64__ | 
|  | pxor IN1, STATE2 | 
|  | pxor IN2, STATE3 | 
|  | pxor IN3, STATE4 | 
|  | movaps IN4, IV | 
|  | #else | 
|  | pxor IN1, STATE4 | 
|  | movaps IN2, IV | 
|  | movups (INP), IN1 | 
|  | pxor IN1, STATE2 | 
|  | movups 0x10(INP), IN2 | 
|  | pxor IN2, STATE3 | 
|  | #endif | 
|  | movups STATE1, (OUTP) | 
|  | movups STATE2, 0x10(OUTP) | 
|  | movups STATE3, 0x20(OUTP) | 
|  | movups STATE4, 0x30(OUTP) | 
|  | sub $64, LEN | 
|  | add $64, INP | 
|  | add $64, OUTP | 
|  | cmp $64, LEN | 
|  | jge .Lcbc_dec_loop4 | 
|  | cmp $16, LEN | 
|  | jb .Lcbc_dec_ret | 
|  | .align 4 | 
|  | .Lcbc_dec_loop1: | 
|  | movups (INP), IN | 
|  | movaps IN, STATE | 
|  | call _aesni_dec1 | 
|  | pxor IV, STATE | 
|  | movups STATE, (OUTP) | 
|  | movaps IN, IV | 
|  | sub $16, LEN | 
|  | add $16, INP | 
|  | add $16, OUTP | 
|  | cmp $16, LEN | 
|  | jge .Lcbc_dec_loop1 | 
|  | .Lcbc_dec_ret: | 
|  | movups IV, (IVP) | 
|  | .Lcbc_dec_just_ret: | 
|  | #ifndef __x86_64__ | 
|  | popl KLEN | 
|  | popl KEYP | 
|  | popl LEN | 
|  | popl IVP | 
|  | #endif | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_cbc_dec) | 
|  |  | 
|  | #ifdef __x86_64__ | 
|  | .pushsection .rodata | 
|  | .align 16 | 
|  | .Lbswap_mask: | 
|  | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 
|  | .popsection | 
|  |  | 
|  | /* | 
|  | * _aesni_inc_init:	internal ABI | 
|  | *	setup registers used by _aesni_inc | 
|  | * input: | 
|  | *	IV | 
|  | * output: | 
|  | *	CTR:	== IV, in little endian | 
|  | *	TCTR_LOW: == lower qword of CTR | 
|  | *	INC:	== 1, in little endian | 
|  | *	BSWAP_MASK == endian swapping mask | 
|  | */ | 
|  | .align 4 | 
|  | _aesni_inc_init: | 
|  | movaps .Lbswap_mask, BSWAP_MASK | 
|  | movaps IV, CTR | 
|  | PSHUFB_XMM BSWAP_MASK CTR | 
|  | mov $1, TCTR_LOW | 
|  | MOVQ_R64_XMM TCTR_LOW INC | 
|  | MOVQ_R64_XMM CTR TCTR_LOW | 
|  | ret | 
|  | ENDPROC(_aesni_inc_init) | 
|  |  | 
|  | /* | 
|  | * _aesni_inc:		internal ABI | 
|  | *	Increase IV by 1, IV is in big endian | 
|  | * input: | 
|  | *	IV | 
|  | *	CTR:	== IV, in little endian | 
|  | *	TCTR_LOW: == lower qword of CTR | 
|  | *	INC:	== 1, in little endian | 
|  | *	BSWAP_MASK == endian swapping mask | 
|  | * output: | 
|  | *	IV:	Increase by 1 | 
|  | * changed: | 
|  | *	CTR:	== output IV, in little endian | 
|  | *	TCTR_LOW: == lower qword of CTR | 
|  | */ | 
|  | .align 4 | 
|  | _aesni_inc: | 
|  | paddq INC, CTR | 
|  | add $1, TCTR_LOW | 
|  | jnc .Linc_low | 
|  | pslldq $8, INC | 
|  | paddq INC, CTR | 
|  | psrldq $8, INC | 
|  | .Linc_low: | 
|  | movaps CTR, IV | 
|  | PSHUFB_XMM BSWAP_MASK IV | 
|  | ret | 
|  | ENDPROC(_aesni_inc) | 
|  |  | 
|  | /* | 
* void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
*		      size_t len, u8 *iv);
|  | */ | 
|  | ENTRY(aesni_ctr_enc) | 
|  | FRAME_BEGIN | 
|  | cmp $16, LEN | 
|  | jb .Lctr_enc_just_ret | 
|  | mov 480(KEYP), KLEN | 
|  | movups (IVP), IV | 
|  | call _aesni_inc_init | 
|  | cmp $64, LEN | 
|  | jb .Lctr_enc_loop1 | 
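# Four counters are incremented and captured alongside their plaintext
# blocks, encrypted together by _aesni_enc4, then XORed into the output:
# C_i = P_i ^ E(K, ctr_i).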
|  | .align 4 | 
|  | .Lctr_enc_loop4: | 
|  | movaps IV, STATE1 | 
|  | call _aesni_inc | 
|  | movups (INP), IN1 | 
|  | movaps IV, STATE2 | 
|  | call _aesni_inc | 
|  | movups 0x10(INP), IN2 | 
|  | movaps IV, STATE3 | 
|  | call _aesni_inc | 
|  | movups 0x20(INP), IN3 | 
|  | movaps IV, STATE4 | 
|  | call _aesni_inc | 
|  | movups 0x30(INP), IN4 | 
|  | call _aesni_enc4 | 
|  | pxor IN1, STATE1 | 
|  | movups STATE1, (OUTP) | 
|  | pxor IN2, STATE2 | 
|  | movups STATE2, 0x10(OUTP) | 
|  | pxor IN3, STATE3 | 
|  | movups STATE3, 0x20(OUTP) | 
|  | pxor IN4, STATE4 | 
|  | movups STATE4, 0x30(OUTP) | 
|  | sub $64, LEN | 
|  | add $64, INP | 
|  | add $64, OUTP | 
|  | cmp $64, LEN | 
|  | jge .Lctr_enc_loop4 | 
|  | cmp $16, LEN | 
|  | jb .Lctr_enc_ret | 
|  | .align 4 | 
|  | .Lctr_enc_loop1: | 
|  | movaps IV, STATE | 
|  | call _aesni_inc | 
|  | movups (INP), IN | 
|  | call _aesni_enc1 | 
|  | pxor IN, STATE | 
|  | movups STATE, (OUTP) | 
|  | sub $16, LEN | 
|  | add $16, INP | 
|  | add $16, OUTP | 
|  | cmp $16, LEN | 
|  | jge .Lctr_enc_loop1 | 
|  | .Lctr_enc_ret: | 
|  | movups IV, (IVP) | 
|  | .Lctr_enc_just_ret: | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_ctr_enc) | 
|  |  | 
|  | /* | 
|  | * _aesni_gf128mul_x_ble:		internal ABI | 
|  | *	Multiply in GF(2^128) for XTS IVs | 
|  | * input: | 
|  | *	IV:	current IV | 
|  | *	GF128MUL_MASK == mask with 0x87 and 0x01 | 
|  | * output: | 
|  | *	IV:	next IV | 
|  | * changed: | 
|  | *	CTR:	== temporary value | 
|  | */ | 
|  | #define _aesni_gf128mul_x_ble() \ | 
|  | pshufd $0x13, IV, CTR; \ | 
|  | paddq IV, IV; \ | 
|  | psrad $31, CTR; \ | 
|  | pand GF128MUL_MASK, CTR; \ | 
|  | pxor CTR, IV; | 
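
/*
 * Hedged C sketch of the multiplication by x (illustrative only), with
 * the block viewed as a 128-bit little-endian integer:
 *
 *	carry = iv >> 127;
 *	iv  <<= 1;
 *	if (carry)
 *		iv ^= 0x87;
 *
 * paddq doubles each 64-bit half but drops the cross-half carry, so the
 * pshufd/psrad pair broadcasts bits 63 and 127 into dword masks; ANDing
 * with GF128MUL_MASK then XORs 0x01 into the high half (the lost carry)
 * and 0x87 into the low byte (the reduction) exactly when needed.
 */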
|  |  | 
|  | /* | 
* void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
*			 bool enc, u8 *iv);
|  | */ | 
|  | ENTRY(aesni_xts_crypt8) | 
|  | FRAME_BEGIN | 
|  | cmpb $0, %cl | 
|  | movl $0, %ecx | 
|  | movl $240, %r10d | 
|  | leaq _aesni_enc4, %r11 | 
|  | leaq _aesni_dec4, %rax | 
|  | cmovel %r10d, %ecx | 
|  | cmoveq %rax, %r11 | 
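
# %cl carried the bool 'enc'.  movl (which leaves EFLAGS intact) preloads
# the encrypt defaults; when enc == 0 the cmovs switch to the inverse key
# schedule at KEYP+240 and to _aesni_dec4.  CALL_NOSPEC below keeps the
# indirect call retpoline-safe.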
|  |  | 
|  | movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK | 
|  | movups (IVP), IV | 
|  |  | 
|  | mov 480(KEYP), KLEN | 
|  | addq %rcx, KEYP | 
|  |  | 
|  | movdqa IV, STATE1 | 
|  | movdqu 0x00(INP), INC | 
|  | pxor INC, STATE1 | 
|  | movdqu IV, 0x00(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movdqa IV, STATE2 | 
|  | movdqu 0x10(INP), INC | 
|  | pxor INC, STATE2 | 
|  | movdqu IV, 0x10(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movdqa IV, STATE3 | 
|  | movdqu 0x20(INP), INC | 
|  | pxor INC, STATE3 | 
|  | movdqu IV, 0x20(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movdqa IV, STATE4 | 
|  | movdqu 0x30(INP), INC | 
|  | pxor INC, STATE4 | 
|  | movdqu IV, 0x30(OUTP) | 
|  |  | 
|  | CALL_NOSPEC %r11 | 
|  |  | 
|  | movdqu 0x00(OUTP), INC | 
|  | pxor INC, STATE1 | 
|  | movdqu STATE1, 0x00(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movdqa IV, STATE1 | 
|  | movdqu 0x40(INP), INC | 
|  | pxor INC, STATE1 | 
|  | movdqu IV, 0x40(OUTP) | 
|  |  | 
|  | movdqu 0x10(OUTP), INC | 
|  | pxor INC, STATE2 | 
|  | movdqu STATE2, 0x10(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movdqa IV, STATE2 | 
|  | movdqu 0x50(INP), INC | 
|  | pxor INC, STATE2 | 
|  | movdqu IV, 0x50(OUTP) | 
|  |  | 
|  | movdqu 0x20(OUTP), INC | 
|  | pxor INC, STATE3 | 
|  | movdqu STATE3, 0x20(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movdqa IV, STATE3 | 
|  | movdqu 0x60(INP), INC | 
|  | pxor INC, STATE3 | 
|  | movdqu IV, 0x60(OUTP) | 
|  |  | 
|  | movdqu 0x30(OUTP), INC | 
|  | pxor INC, STATE4 | 
|  | movdqu STATE4, 0x30(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movdqa IV, STATE4 | 
|  | movdqu 0x70(INP), INC | 
|  | pxor INC, STATE4 | 
|  | movdqu IV, 0x70(OUTP) | 
|  |  | 
|  | _aesni_gf128mul_x_ble() | 
|  | movups IV, (IVP) | 
|  |  | 
|  | CALL_NOSPEC %r11 | 
|  |  | 
|  | movdqu 0x40(OUTP), INC | 
|  | pxor INC, STATE1 | 
|  | movdqu STATE1, 0x40(OUTP) | 
|  |  | 
|  | movdqu 0x50(OUTP), INC | 
|  | pxor INC, STATE2 | 
|  | movdqu STATE2, 0x50(OUTP) | 
|  |  | 
|  | movdqu 0x60(OUTP), INC | 
|  | pxor INC, STATE3 | 
|  | movdqu STATE3, 0x60(OUTP) | 
|  |  | 
|  | movdqu 0x70(OUTP), INC | 
|  | pxor INC, STATE4 | 
|  | movdqu STATE4, 0x70(OUTP) | 
|  |  | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(aesni_xts_crypt8) | 
|  |  | 
|  | #endif |