/* arch/x86/crypto/camellia-x86_64-asm_64.S (linux-mtk tree, Git at Google) */

 /*
  * Camellia Cipher Algorithm (x86_64)
  *
  * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  * USA
  *
  */

 #include <linux/linkage.h>

 /*
  * The eight lookup tables below are defined outside this file (.extern).
  * The round macros read them as arrays of 8-byte entries indexed by a
  * single byte value (see xor2ror16); the #defines give them shorter names.
  */
 .file "camellia-x86_64-asm_64.S"
 .text

 .extern camellia_sp10011110;
 .extern camellia_sp22000222;
 .extern camellia_sp03303033;
 .extern camellia_sp00444404;
 .extern camellia_sp02220222;
 .extern camellia_sp30333033;
 .extern camellia_sp44044404;
 .extern camellia_sp11101110;

 #define sp10011110 camellia_sp10011110
 #define sp22000222 camellia_sp22000222
 #define sp03303033 camellia_sp03303033
 #define sp00444404 camellia_sp00444404
 #define sp02220222 camellia_sp02220222
 #define sp30333033 camellia_sp30333033
 #define sp44044404 camellia_sp44044404
 #define sp11101110 camellia_sp11101110

 /*
  * Size in bytes of the subkey table at the start of the context.
  * NOTE(review): must stay in sync with the C definition of
  * struct camellia_ctx, which is not visible in this file.
  */
 #define CAMELLIA_TABLE_BYTE_LEN 272

 /* struct camellia_ctx: byte offsets of the fields accessed through CTX */
 #define key_table 0
 #define key_length CAMELLIA_TABLE_BYTE_LEN

 /* register macros */
 /*
  * CTX is the context pointer (first argument).  RIO is the current input
  * or output pointer; it shares %rsi with RT0 below, so it is only valid
  * until the first round macro runs and is reloaded from RDST before the
  * output stage.
  */
 #define CTX %rdi
 #define RIO %rsi
 #define RIOd %esi

 /*
  * Cipher state: RAB and RCD hold the two 64-bit halves of a block.  The
  * trailing 0/1 selects the block (block 1 is only used by the 2-way code).
  * The d, bl and bh variants name the 32-bit, low-byte and second-byte
  * sub-registers and are selected by token pasting inside the macros.
  */
 #define RAB0 %rax
 #define RCD0 %rcx
 #define RAB1 %rbx
 #define RCD1 %rdx

 #define RAB0d %eax
 #define RCD0d %ecx
 #define RAB1d %ebx
 #define RCD1d %edx

 #define RAB0bl %al
 #define RCD0bl %cl
 #define RAB1bl %bl
 #define RCD1bl %dl

 #define RAB0bh %ah
 #define RCD0bh %ch
 #define RAB1bh %bh
 #define RCD1bh %dh

 /* scratch registers; RT1 is callee-saved %r12 and is preserved via RR12 */
 #define RT0 %rsi
 #define RT1 %r12
 #define RT2 %r8

 #define RT0d %esi
 #define RT1d %r12d
 #define RT2d %r8d

 #define RT2bl %r8b

 /*
  * RXOR: copy of the xor flag in the encrypt routines; the decrypt routines
  *       reuse it as scratch and (2-way) as the save slot for %rbx.
  * RR12: holds the caller's %r12 while RT1 is in use.
  * RDST: holds the destination pointer until the output stage.
  */
 #define RXOR %r9
 #define RR12 %r10
 #define RDST %r11

 #define RXORd %r9d
 #define RXORbl %r9b

 /*
  * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
  *
  * Consume the two lowest bytes of register ab as table indices:
  *	dst ^= T0[low byte of ab] ^ T1[second byte of ab]
  * and rotate ab right by 16 bits so the next two bytes move into place.
  * Both tables are indexed with an 8-byte stride.
  *
  * ab must be a register macro with bl and bh variants (only the RAB/RCD
  * registers have them); tmp1 and tmp2 need a d variant.
  * Clobbers tmp1, tmp2 and flags.
  */
 #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
 	movzbl ab ## bl,		tmp2 ## d; \
 	movzbl ab ## bh,		tmp1 ## d; \
 	rorq $16,			ab; \
 	xorq T0(, tmp2, 8),		dst; \
 	xorq T1(, tmp1, 8),		dst;

 /**********************************************************************
   1-way camellia
  **********************************************************************/
 /*
  * roundsm(ab, subkey, cd): one round on a single block.
  *
  * Loads the 8-byte subkey at key_table + subkey * 8 into RT2, then runs
  * xor2ror16 four times on ab ## 0.  Each step uses two bytes and rotates
  * by 16, so all eight bytes of ab ## 0 serve as table indices and the
  * register is back at its original value afterwards.  The lookups are
  * accumulated alternately into cd ## 0 and RT2; the final xorq folds RT2
  * (subkey plus half of the lookups) into cd ## 0.
  *
  * ab, cd: register stems RAB or RCD.  Clobbers RT0, RT1, RT2 and flags.
  */
 #define roundsm(ab, subkey, cd) \
 	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
 	\
 	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
 	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
 	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
 	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
 	\
 	xorq RT2,					cd ## 0;

 /*
  * fls(l, r, kl, kr): key-dependent mixing layer applied to one block
  * between groups of rounds (presumably Camellia's FL / FL^-1 functions;
  * confirm against the cipher specification).
  *
  * With lo/hi denoting the low/high 32 bits of a 64-bit value and K[n] the
  * 8-byte entry at key_table + n * 8, the four steps are, in order:
  *	l.hi ^= rol32(l.lo & K[kl].lo, 1)
  *	r.lo ^= r.hi | K[kr].hi
  *	l.lo ^= l.hi | K[kl].hi
  *	r.hi ^= rol32(r.lo & K[kr].lo, 1)
  * Steps 3 and 4 read the halves updated by steps 1 and 2, so the order
  * matters.  l, r: register stems RAB or RCD (block 0 is used).
  * Clobbers RT0, RT1, RT2 and flags.
  */
 #define fls(l, r, kl, kr) \
 	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
 	andl l ## 0d,					RT0d; \
 	roll $1,					RT0d; \
 	shlq $32,					RT0; \
 	xorq RT0,					l ## 0; \
 	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
 	orq r ## 0,					RT1; \
 	shrq $32,					RT1; \
 	xorq RT1,					r ## 0; \
 	\
 	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
 	orq l ## 0,					RT2; \
 	shrq $32,					RT2; \
 	xorq RT2,					l ## 0; \
 	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
 	andl r ## 0d,					RT0d; \
 	roll $1,					RT0d; \
 	shlq $32,					RT0; \
 	xorq RT0,					r ## 0;

 /*
  * enc_rounds(i): six consecutive rounds for encryption using subkeys
  * i + 2 .. i + 7 in ascending order.  RAB and RCD swap roles every round:
  * one half supplies the table indices, the other is updated.
  */
 #define enc_rounds(i) \
 	roundsm(RAB, i + 2, RCD); \
 	roundsm(RCD, i + 3, RAB); \
 	roundsm(RAB, i + 4, RCD); \
 	roundsm(RCD, i + 5, RAB); \
 	roundsm(RAB, i + 6, RCD); \
 	roundsm(RCD, i + 7, RAB);

 /* enc_fls(i): fls layer for encryption; RAB uses subkey i, RCD uses i + 1 */
 #define enc_fls(i) \
 	fls(RAB, RCD, i + 0, i + 1);

 /*
  * enc_inpack(): load one 16-byte block from RIO for encryption.
  * Each 8-byte half is byte-swapped and rotated by 32 bits (which swaps its
  * two 32-bit words); RAB0 gets bytes 0-7 and RCD0 gets bytes 8-15.
  * Only RAB0 is XORed with the first 8-byte key_table entry here.
  */
 #define enc_inpack() \
 	movq (RIO),			RAB0; \
 	bswapq				RAB0; \
 	rolq $32,			RAB0; \
 	movq 4*2(RIO),			RCD0; \
 	bswapq				RCD0; \
 	rorq $32,			RCD0; \
 	xorq key_table(CTX),		RAB0;

 /*
  * enc_outunpack(op, max): finish encryption of one block and write it out.
  * RCD0 is XORed with the 8-byte key_table entry at index max (a register),
  * then both halves get the inverse of the enc_inpack rotate/byte-swap.
  * RCD0 goes to bytes 0-7 at RIO and RAB0 to bytes 8-15, i.e. the halves
  * are written in the opposite order from how they were loaded.
  * op: "mov" stores the result, "xor" XORs it into the destination (the
  * q suffix is pasted on to form movq/xorq).
  */
 #define enc_outunpack(op, max) \
 	xorq key_table(CTX, max, 8),	RCD0; \
 	rorq $32,			RCD0; \
 	bswapq				RCD0; \
 	op ## q RCD0,			(RIO); \
 	rolq $32,			RAB0; \
 	bswapq				RAB0; \
 	op ## q RAB0,			4*2(RIO);

 /*
  * dec_rounds(i): six consecutive rounds for decryption; same structure as
  * enc_rounds but the subkeys i + 7 .. i + 2 are used in descending order.
  */
 #define dec_rounds(i) \
 	roundsm(RAB, i + 7, RCD); \
 	roundsm(RCD, i + 6, RAB); \
 	roundsm(RAB, i + 5, RCD); \
 	roundsm(RCD, i + 4, RAB); \
 	roundsm(RAB, i + 3, RCD); \
 	roundsm(RCD, i + 2, RAB);

 /*
  * dec_fls(i): fls layer for decryption; the two subkeys are swapped
  * relative to enc_fls (RAB uses i + 1, RCD uses i).
  */
 #define dec_fls(i) \
 	fls(RAB, RCD, i + 1, i + 0);

 /*
  * dec_inpack(max): load one 16-byte block from RIO for decryption.
  * Same unpacking as enc_inpack, except that RAB0 is XORed with the 8-byte
  * key_table entry at index max (a register) instead of the first entry.
  */
 #define dec_inpack(max) \
 	movq (RIO),			RAB0; \
 	bswapq				RAB0; \
 	rolq $32,			RAB0; \
 	movq 4*2(RIO),			RCD0; \
 	bswapq				RCD0; \
 	rorq $32,			RCD0; \
 	xorq key_table(CTX, max, 8),	RAB0;

 /*
  * dec_outunpack(): finish decryption of one block and store it at RIO.
  * RCD0 is XORed with the first 8-byte key_table entry; both halves are
  * rotated/byte-swapped back and stored with RCD0 first, RAB0 second.
  */
 #define dec_outunpack() \
 	xorq key_table(CTX),		RCD0; \
 	rorq $32,			RCD0; \
 	bswapq				RCD0; \
 	movq RCD0,			(RIO); \
 	rolq $32,			RAB0; \
 	bswapq				RAB0; \
 	movq RAB0,			4*2(RIO);

 /*
  * __camellia_enc_blk: encrypt one 16-byte block.
  *
  * Runs 18 rounds (three groups of six with an fls layer between groups)
  * and, unless key_length is 16, one more fls layer and six more rounds.
  * The result is either stored to dst or XORed into it, depending on the
  * xor argument.
  *
  * %r12 is used as RT1, so the caller's value is parked in RR12 and
  * restored before each ret.
  */
 ENTRY(__camellia_enc_blk)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: bool xor
 	 */
 	movq %r12, RR12;

 	/* %rcx becomes RCD0 and %rsi becomes RIO/RT0: copy the arguments out */
 	movq %rcx, RXOR;
 	movq %rsi, RDST;
 	movq %rdx, RIO;

 	enc_inpack();

 	enc_rounds(0);
 	enc_fls(8);
 	enc_rounds(8);
 	enc_fls(16);
 	enc_rounds(16);
 	/* set only now: RT1 is clobbered by the round macros above */
 	movl $24, RT1d; /* max */

 	/* compare the whole 32-bit field, as the decrypt routines do */
 	cmpl $16, key_length(CTX);
 	je .L__enc_done;

 	enc_fls(24);
 	enc_rounds(24);
 	movl $32, RT1d; /* max */

 .L__enc_done:
 	testb RXORbl, RXORbl;
 	/* movq leaves the flags from testb intact for the jnz below */
 	movq RDST, RIO;

 	jnz .L__enc_xor;

 	enc_outunpack(mov, RT1);

 	movq RR12, %r12;
 	ret;

 .L__enc_xor:
 	enc_outunpack(xor, RT1);

 	movq RR12, %r12;
 	ret;
 ENDPROC(__camellia_enc_blk)

 /*
  * camellia_dec_blk: decrypt one 16-byte block from src into dst.
  *
  * max (RT2) is 24 when key_length is 16 and 32 otherwise.  It selects the
  * key_table entry XORed into the input and decides whether the extra six
  * rounds and fls layer run first.  The subkeys are then walked in the
  * reverse order of encryption.
  */
 ENTRY(camellia_dec_blk)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	/* movl does not touch flags, so cmovel still sees the cmpl result */
 	cmpl $16, key_length(CTX);
 	movl $32, RT2d;
 	movl $24, RXORd;
 	cmovel RXORd, RT2d; /* max */

 	/* RT1 is %r12 (callee-saved); %rsi is about to be reused as RIO/RT0 */
 	movq %r12, RR12;
 	movq %rsi, RDST;
 	movq %rdx, RIO;

 	dec_inpack(RT2);

 	/* last use of max: RT2 is clobbered by the round macros below */
 	cmpb $24, RT2bl;
 	je .L__dec_rounds16;

 	dec_rounds(24);
 	dec_fls(24);

 .L__dec_rounds16:
 	dec_rounds(16);
 	dec_fls(16);
 	dec_rounds(8);
 	dec_fls(8);
 	dec_rounds(0);

 	movq RDST, RIO;

 	dec_outunpack();

 	movq RR12, %r12;
 	ret;
 ENDPROC(camellia_dec_blk)

 /**********************************************************************
   2-way camellia
  **********************************************************************/
 /*
  * roundsm2(ab, subkey, cd): roundsm applied to two blocks (suffix 0 and 1).
  *
  * The subkey is loaded into RT2 and XORed into cd ## 1 right away.  Block 0
  * is then processed as in roundsm, with RT2 as its second accumulator;
  * block 1 accumulates all of its lookups directly in cd ## 1.  The
  * "xorq RT2, cd ## 0" that completes block 0 is placed between block 1's
  * lookups.  Clobbers RT0, RT1, RT2 and flags.
  */
 #define roundsm2(ab, subkey, cd) \
 	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
 	xorq RT2,					cd ## 1; \
 	\
 	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
 	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
 	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
 	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
 	\
 		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
 		xorq RT2,					cd ## 0; \
 		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
 		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
 		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

 /*
  * fls2(l, r, kl, kr): the fls layer applied to two blocks.
  *
  * The same four steps as fls are performed on block 0 and block 1, in the
  * order: block 0 steps 1-2, block 1 steps 1-2 (indented), block 0
  * steps 3-4, block 1 steps 3-4 (indented).  The temporaries rotate among
  * RT0, RT1 and RT2.  Clobbers RT0, RT1, RT2 and flags.
  */
 #define fls2(l, r, kl, kr) \
 	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
 	andl l ## 0d,					RT0d; \
 	roll $1,					RT0d; \
 	shlq $32,					RT0; \
 	xorq RT0,					l ## 0; \
 	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
 	orq r ## 0,					RT1; \
 	shrq $32,					RT1; \
 	xorq RT1,					r ## 0; \
 	\
 		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
 		andl l ## 1d,					RT2d; \
 		roll $1,					RT2d; \
 		shlq $32,					RT2; \
 		xorq RT2,					l ## 1; \
 		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
 		orq r ## 1,					RT0; \
 		shrq $32,					RT0; \
 		xorq RT0,					r ## 1; \
 	\
 	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
 	orq l ## 0,					RT1; \
 	shrq $32,					RT1; \
 	xorq RT1,					l ## 0; \
 	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
 	andl r ## 0d,					RT2d; \
 	roll $1,					RT2d; \
 	shlq $32,					RT2; \
 	xorq RT2,					r ## 0; \
 	\
 		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
 		orq l ## 1,					RT0; \
 		shrq $32,					RT0; \
 		xorq RT0,					l ## 1; \
 		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
 		andl r ## 1d,					RT1d; \
 		roll $1,					RT1d; \
 		shlq $32,					RT1; \
 		xorq RT1,					r ## 1;

 /*
  * enc_rounds2(i): six 2-way rounds for encryption using subkeys
  * i + 2 .. i + 7 in ascending order, RAB and RCD swapping roles each round.
  */
 #define enc_rounds2(i) \
 	roundsm2(RAB, i + 2, RCD); \
 	roundsm2(RCD, i + 3, RAB); \
 	roundsm2(RAB, i + 4, RCD); \
 	roundsm2(RCD, i + 5, RAB); \
 	roundsm2(RAB, i + 6, RCD); \
 	roundsm2(RCD, i + 7, RAB);

 /* enc_fls2(i): 2-way fls layer for encryption; RAB uses subkey i, RCD i + 1 */
 #define enc_fls2(i) \
 	fls2(RAB, RCD, i + 0, i + 1);

 /*
  * enc_inpack2(): load two consecutive 16-byte blocks (32 bytes) from RIO.
  * Block 0 comes from byte offsets 0 and 8, block 1 from offsets 16 and 24.
  * Every 8-byte half is byte-swapped and rotated by 32 bits; RAB0 and RAB1
  * are each XORed with the first 8-byte key_table entry.
  */
 #define enc_inpack2() \
 	movq (RIO),			RAB0; \
 	bswapq				RAB0; \
 	rorq $32,			RAB0; \
 	movq 4*2(RIO),			RCD0; \
 	bswapq				RCD0; \
 	rolq $32,			RCD0; \
 	xorq key_table(CTX),		RAB0; \
 	\
 		movq 8*2(RIO),			RAB1; \
 		bswapq				RAB1; \
 		rorq $32,			RAB1; \
 		movq 12*2(RIO),			RCD1; \
 		bswapq				RCD1; \
 		rolq $32,			RCD1; \
 		xorq key_table(CTX),		RAB1;

 /*
  * enc_outunpack2(op, max): finish encryption of two blocks and write the
  * 32 bytes of output at RIO.  For each block, RCD is XORed with the 8-byte
  * key_table entry at index max (a register); both halves are rotated and
  * byte-swapped back, and RCD is written before RAB.
  * op: "mov" stores the result, "xor" XORs it into the destination.
  */
 #define enc_outunpack2(op, max) \
 	xorq key_table(CTX, max, 8),	RCD0; \
 	rolq $32,			RCD0; \
 	bswapq				RCD0; \
 	op ## q RCD0,			(RIO); \
 	rorq $32,			RAB0; \
 	bswapq				RAB0; \
 	op ## q RAB0,			4*2(RIO); \
 	\
 		xorq key_table(CTX, max, 8),	RCD1; \
 		rolq $32,			RCD1; \
 		bswapq				RCD1; \
 		op ## q RCD1,			8*2(RIO); \
 		rorq $32,			RAB1; \
 		bswapq				RAB1; \
 		op ## q RAB1,			12*2(RIO);

 /*
  * dec_rounds2(i): six 2-way rounds for decryption; subkeys i + 7 .. i + 2
  * are used in descending order.
  */
 #define dec_rounds2(i) \
 	roundsm2(RAB, i + 7, RCD); \
 	roundsm2(RCD, i + 6, RAB); \
 	roundsm2(RAB, i + 5, RCD); \
 	roundsm2(RCD, i + 4, RAB); \
 	roundsm2(RAB, i + 3, RCD); \
 	roundsm2(RCD, i + 2, RAB);

 /*
  * dec_fls2(i): 2-way fls layer for decryption; the two subkeys are swapped
  * relative to enc_fls2 (RAB uses i + 1, RCD uses i).
  */
 #define dec_fls2(i) \
 	fls2(RAB, RCD, i + 1, i + 0);

 /*
  * dec_inpack2(max): load two consecutive 16-byte blocks from RIO for
  * decryption.  Same unpacking as enc_inpack2, except that RAB0 and RAB1
  * are XORed with the 8-byte key_table entry at index max (a register).
  */
 #define dec_inpack2(max) \
 	movq (RIO),			RAB0; \
 	bswapq				RAB0; \
 	rorq $32,			RAB0; \
 	movq 4*2(RIO),			RCD0; \
 	bswapq				RCD0; \
 	rolq $32,			RCD0; \
 	xorq key_table(CTX, max, 8),	RAB0; \
 	\
 		movq 8*2(RIO),			RAB1; \
 		bswapq				RAB1; \
 		rorq $32,			RAB1; \
 		movq 12*2(RIO),			RCD1; \
 		bswapq				RCD1; \
 		rolq $32,			RCD1; \
 		xorq key_table(CTX, max, 8),	RAB1;

 /*
  * dec_outunpack2(): finish decryption of two blocks and store the 32 bytes
  * of output at RIO.  For each block, RCD is XORed with the first 8-byte
  * key_table entry; both halves are rotated and byte-swapped back, and RCD
  * is stored before RAB.
  */
 #define dec_outunpack2() \
 	xorq key_table(CTX),		RCD0; \
 	rolq $32,			RCD0; \
 	bswapq				RCD0; \
 	movq RCD0,			(RIO); \
 	rorq $32,			RAB0; \
 	bswapq				RAB0; \
 	movq RAB0,			4*2(RIO); \
 	\
 		xorq key_table(CTX),		RCD1; \
 		rolq $32,			RCD1; \
 		bswapq				RCD1; \
 		movq RCD1,			8*2(RIO); \
 		rorq $32,			RAB1; \
 		bswapq				RAB1; \
 		movq RAB1,			12*2(RIO);

 /*
  * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
  *
  * Same schedule as __camellia_enc_blk (18 rounds, plus an fls layer and
  * six more rounds unless key_length is 16), with both blocks processed in
  * every macro.  The 32 bytes of output are stored to dst or XORed into
  * it, depending on the xor argument.
  *
  * %rbx (RAB1) is saved on the stack and %r12 (RT1) in RR12; both are
  * restored before each ret.
  */
 ENTRY(__camellia_enc_blk_2way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: bool xor
 	 */
 	pushq %rbx;

 	/* %rcx becomes RCD0 and %rsi becomes RIO/RT0: copy the arguments out */
 	movq %r12, RR12;
 	movq %rcx, RXOR;
 	movq %rsi, RDST;
 	movq %rdx, RIO;

 	enc_inpack2();

 	enc_rounds2(0);
 	enc_fls2(8);
 	enc_rounds2(8);
 	enc_fls2(16);
 	enc_rounds2(16);
 	/* set only now: RT2 is clobbered by the round macros above */
 	movl $24, RT2d; /* max */

 	/* compare the whole 32-bit field, as the decrypt routines do */
 	cmpl $16, key_length(CTX);
 	je .L__enc2_done;

 	enc_fls2(24);
 	enc_rounds2(24);
 	movl $32, RT2d; /* max */

 .L__enc2_done:
 	/* explicit byte suffix, matching the 1-way routine */
 	testb RXORbl, RXORbl;
 	/* movq leaves the flags from testb intact for the jnz below */
 	movq RDST, RIO;
 	jnz .L__enc2_xor;

 	enc_outunpack2(mov, RT2);

 	movq RR12, %r12;
 	popq %rbx;
 	ret;

 .L__enc2_xor:
 	enc_outunpack2(xor, RT2);

 	movq RR12, %r12;
 	popq %rbx;
 	ret;
 ENDPROC(__camellia_enc_blk_2way)

 /*
  * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks from src
  * into dst.
  *
  * max (RT2) is 24 when key_length is 16 and 32 otherwise; it is used as
  * in camellia_dec_blk.  %rbx (RAB1) is kept in RXOR and %r12 (RT1) in
  * RR12 for the duration of the call; no stack is used.
  */
 ENTRY(camellia_dec_blk_2way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	/* movl does not touch flags, so cmovel still sees the cmpl result */
 	cmpl $16, key_length(CTX);
 	movl $32, RT2d;
 	movl $24, RXORd;
 	cmovel RXORd, RT2d; /* max */

 	/* RXOR is free again after the cmov; reuse it to hold %rbx */
 	movq %rbx, RXOR;
 	movq %r12, RR12;
 	movq %rsi, RDST;
 	movq %rdx, RIO;

 	dec_inpack2(RT2);

 	/* last use of max: RT2 is clobbered by the round macros below */
 	cmpb $24, RT2bl;
 	je .L__dec2_rounds16;

 	dec_rounds2(24);
 	dec_fls2(24);

 .L__dec2_rounds16:
 	dec_rounds2(16);
 	dec_fls2(16);
 	dec_rounds2(8);
 	dec_fls2(8);
 	dec_rounds2(0);

 	movq RDST, RIO;

 	dec_outunpack2();

 	movq RR12, %r12;
 	movq RXOR, %rbx;
 	ret;
 ENDPROC(camellia_dec_blk_2way)
	/*
	* Camellia Cipher Algorithm (x86_64)
	*
	* Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	* USA
	*
	*/

	#include <linux/linkage.h>

	/*
	* The eight lookup tables below are defined outside this file (.extern).
	* The round macros read them as arrays of 8-byte entries indexed by a
	* single byte value (see xor2ror16); the #defines give them shorter names.
	*/
	.file "camellia-x86_64-asm_64.S"
	.text

	.extern camellia_sp10011110;
	.extern camellia_sp22000222;
	.extern camellia_sp03303033;
	.extern camellia_sp00444404;
	.extern camellia_sp02220222;
	.extern camellia_sp30333033;
	.extern camellia_sp44044404;
	.extern camellia_sp11101110;

	#define sp10011110 camellia_sp10011110
	#define sp22000222 camellia_sp22000222
	#define sp03303033 camellia_sp03303033
	#define sp00444404 camellia_sp00444404
	#define sp02220222 camellia_sp02220222
	#define sp30333033 camellia_sp30333033
	#define sp44044404 camellia_sp44044404
	#define sp11101110 camellia_sp11101110

	/*
	* Size in bytes of the subkey table at the start of the context.
	* NOTE(review): must stay in sync with the C definition of
	* struct camellia_ctx, which is not visible in this file.
	*/
	#define CAMELLIA_TABLE_BYTE_LEN 272

	/* struct camellia_ctx: byte offsets of the fields accessed through CTX */
	#define key_table 0
	#define key_length CAMELLIA_TABLE_BYTE_LEN

	/* register macros */
	/*
	* CTX is the context pointer (first argument).  RIO is the current input
	* or output pointer; it shares %rsi with RT0 below, so it is only valid
	* until the first round macro runs and is reloaded from RDST before the
	* output stage.
	*/
	#define CTX %rdi
	#define RIO %rsi
	#define RIOd %esi

	/*
	* Cipher state: RAB and RCD hold the two 64-bit halves of a block.  The
	* trailing 0/1 selects the block (block 1 is only used by the 2-way code).
	* The d, bl and bh variants name the 32-bit, low-byte and second-byte
	* sub-registers and are selected by token pasting inside the macros.
	*/
	#define RAB0 %rax
	#define RCD0 %rcx
	#define RAB1 %rbx
	#define RCD1 %rdx

	#define RAB0d %eax
	#define RCD0d %ecx
	#define RAB1d %ebx
	#define RCD1d %edx

	#define RAB0bl %al
	#define RCD0bl %cl
	#define RAB1bl %bl
	#define RCD1bl %dl

	#define RAB0bh %ah
	#define RCD0bh %ch
	#define RAB1bh %bh
	#define RCD1bh %dh

	/* scratch registers; RT1 is callee-saved %r12 and is preserved via RR12 */
	#define RT0 %rsi
	#define RT1 %r12
	#define RT2 %r8

	#define RT0d %esi
	#define RT1d %r12d
	#define RT2d %r8d

	#define RT2bl %r8b

	/*
	* RXOR: copy of the xor flag in the encrypt routines; the decrypt routines
	*       reuse it as scratch and (2-way) as the save slot for %rbx.
	* RR12: holds the caller's %r12 while RT1 is in use.
	* RDST: holds the destination pointer until the output stage.
	*/
	#define RXOR %r9
	#define RR12 %r10
	#define RDST %r11

	#define RXORd %r9d
	#define RXORbl %r9b

	/*
	* xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
	*
	* Consume the two lowest bytes of register ab as table indices:
	*	dst ^= T0[low byte of ab] ^ T1[second byte of ab]
	* and rotate ab right by 16 bits so the next two bytes move into place.
	* Both tables are indexed with an 8-byte stride.
	*
	* ab must be a register macro with bl and bh variants (only the RAB/RCD
	* registers have them); tmp1 and tmp2 need a d variant.
	* Clobbers tmp1, tmp2 and flags.
	*/
	#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl, tmp2 ## d; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $16, ab; \
	xorq T0(, tmp2, 8), dst; \
	xorq T1(, tmp1, 8), dst;

	/**********************************************************************
	1-way camellia
	**********************************************************************/
	/*
	* roundsm(ab, subkey, cd): one round on a single block.
	*
	* Loads the 8-byte subkey at key_table + subkey * 8 into RT2, then runs
	* xor2ror16 four times on ab ## 0.  Each step uses two bytes and rotates
	* by 16, so all eight bytes of ab ## 0 serve as table indices and the
	* register is back at its original value afterwards.  The lookups are
	* accumulated alternately into cd ## 0 and RT2; the final xorq folds RT2
	* (subkey plus half of the lookups) into cd ## 0.
	*
	* ab, cd: register stems RAB or RCD.  Clobbers RT0, RT1, RT2 and flags.
	*/
	#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2, cd ## 0;

	/*
	* fls(l, r, kl, kr): key-dependent mixing layer applied to one block
	* between groups of rounds (presumably Camellia's FL / FL^-1 functions;
	* confirm against the cipher specification).
	*
	* With lo/hi denoting the low/high 32 bits of a 64-bit value and K[n] the
	* 8-byte entry at key_table + n * 8, the four steps are, in order:
	*	l.hi ^= rol32(l.lo & K[kl].lo, 1)
	*	r.lo ^= r.hi | K[kr].hi
	*	l.lo ^= l.hi | K[kl].hi
	*	r.hi ^= rol32(r.lo & K[kr].lo, 1)
	* Steps 3 and 4 read the halves updated by steps 1 and 2, so the order
	* matters.  l, r: register stems RAB or RCD (block 0 is used).
	* Clobbers RT0, RT1, RT2 and flags.
	*/
	#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq l ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl r ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, r ## 0;

	/*
	* enc_rounds(i): six consecutive rounds for encryption using subkeys
	* i + 2 .. i + 7 in ascending order.  RAB and RCD swap roles every round:
	* one half supplies the table indices, the other is updated.
	*/
	#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

	/* enc_fls(i): fls layer for encryption; RAB uses subkey i, RCD uses i + 1 */
	#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

	/*
	* enc_inpack(): load one 16-byte block from RIO for encryption.
	* Each 8-byte half is byte-swapped and rotated by 32 bits (which swaps its
	* two 32-bit words); RAB0 gets bytes 0-7 and RCD0 gets bytes 8-15.
	* Only RAB0 is XORed with the first 8-byte key_table entry here.
	*/
	#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;

	/*
	* enc_outunpack(op, max): finish encryption of one block and write it out.
	* RCD0 is XORed with the 8-byte key_table entry at index max (a register),
	* then both halves get the inverse of the enc_inpack rotate/byte-swap.
	* RCD0 goes to bytes 0-7 at RIO and RAB0 to bytes 8-15, i.e. the halves
	* are written in the opposite order from how they were loaded.
	* op: "mov" stores the result, "xor" XORs it into the destination (the
	* q suffix is pasted on to form movq/xorq).
	*/
	#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);

	/*
	* dec_rounds(i): six consecutive rounds for decryption; same structure as
	* enc_rounds but the subkeys i + 7 .. i + 2 are used in descending order.
	*/
	#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

	/*
	* dec_fls(i): fls layer for decryption; the two subkeys are swapped
	* relative to enc_fls (RAB uses i + 1, RCD uses i).
	*/
	#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

	/*
	* dec_inpack(max): load one 16-byte block from RIO for decryption.
	* Same unpacking as enc_inpack, except that RAB0 is XORed with the 8-byte
	* key_table entry at index max (a register) instead of the first entry.
	*/
	#define dec_inpack(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0;

	/*
	* dec_outunpack(): finish decryption of one block and store it at RIO.
	* RCD0 is XORed with the first 8-byte key_table entry; both halves are
	* rotated/byte-swapped back and stored with RCD0 first, RAB0 second.
	*/
	#define dec_outunpack() \
	xorq key_table(CTX), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);

	/*
	* __camellia_enc_blk: encrypt one 16-byte block.
	*
	* Runs 18 rounds (three groups of six with an fls layer between groups)
	* and, unless key_length is 16, one more fls layer and six more rounds.
	* The result is either stored to dst or XORed into it, depending on the
	* xor argument.
	*
	* %r12 is used as RT1, so the caller's value is parked in RR12 and
	* restored before each ret.
	*/
	ENTRY(__camellia_enc_blk)
	/* input:
	* %rdi: ctx, CTX
	* %rsi: dst
	* %rdx: src
	* %rcx: bool xor
	*/
	movq %r12, RR12;

	/* %rcx becomes RCD0 and %rsi becomes RIO/RT0: copy the arguments out */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	/* set only now: RT1 is clobbered by the round macros above */
	movl $24, RT1d; /* max */

	/* compare the whole 32-bit field, as the decrypt routines do */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

	.L__enc_done:
	testb RXORbl, RXORbl;
	/* movq leaves the flags from testb intact for the jnz below */
	movq RDST, RIO;

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	ret;

	.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	ret;
	ENDPROC(__camellia_enc_blk)

	/*
	* camellia_dec_blk: decrypt one 16-byte block from src into dst.
	*
	* max (RT2) is 24 when key_length is 16 and 32 otherwise.  It selects the
	* key_table entry XORed into the input and decides whether the extra six
	* rounds and fls layer run first.  The subkeys are then walked in the
	* reverse order of encryption.
	*/
	ENTRY(camellia_dec_blk)
	/* input:
	* %rdi: ctx, CTX
	* %rsi: dst
	* %rdx: src
	*/
	/* movl does not touch flags, so cmovel still sees the cmpl result */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* RT1 is %r12 (callee-saved); %rsi is about to be reused as RIO/RT0 */
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* last use of max: RT2 is clobbered by the round macros below */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

	.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	ret;
	ENDPROC(camellia_dec_blk)

	/**********************************************************************
	2-way camellia
	**********************************************************************/
	/*
	* roundsm2(ab, subkey, cd): roundsm applied to two blocks (suffix 0 and 1).
	*
	* The subkey is loaded into RT2 and XORed into cd ## 1 right away.  Block 0
	* is then processed as in roundsm, with RT2 as its second accumulator;
	* block 1 accumulates all of its lookups directly in cd ## 1.  The
	* "xorq RT2, cd ## 0" that completes block 0 is placed between block 1's
	* lookups.  Clobbers RT0, RT1, RT2 and flags.
	*/
	#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
	xorq RT2, cd ## 0; \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

	/*
	* fls2(l, r, kl, kr): the fls layer applied to two blocks.
	*
	* The same four steps as fls are performed on block 0 and block 1, in the
	* order: block 0 steps 1-2, block 1 steps 1-2, block 0 steps 3-4,
	* block 1 steps 3-4.  The temporaries rotate among RT0, RT1 and RT2.
	* Clobbers RT0, RT1, RT2 and flags.
	*/
	#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
	andl l ## 1d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, l ## 1; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
	orq r ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq l ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl r ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
	orq l ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, l ## 1; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
	andl r ## 1d, RT1d; \
	roll $1, RT1d; \
	shlq $32, RT1; \
	xorq RT1, r ## 1;

	/*
	* enc_rounds2(i): six 2-way rounds for encryption using subkeys
	* i + 2 .. i + 7 in ascending order, RAB and RCD swapping roles each round.
	*/
	#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

	/* enc_fls2(i): 2-way fls layer for encryption; RAB uses subkey i, RCD i + 1 */
	#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

	/*
	* enc_inpack2(): load two consecutive 16-byte blocks (32 bytes) from RIO.
	* Block 0 comes from byte offsets 0 and 8, block 1 from offsets 16 and 24.
	* Every 8-byte half is byte-swapped and rotated by 32 bits; RAB0 and RAB1
	* are each XORed with the first 8-byte key_table entry.
	*/
	#define enc_inpack2() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX), RAB1;

	/*
	* enc_outunpack2(op, max): finish encryption of two blocks and write the
	* 32 bytes of output at RIO.  For each block, RCD is XORed with the 8-byte
	* key_table entry at index max (a register); both halves are rotated and
	* byte-swapped back, and RCD is written before RAB.
	* op: "mov" stores the result, "xor" XORs it into the destination.
	*/
	#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX, max, 8), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	op ## q RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	op ## q RAB1, 12*2(RIO);

	/*
	* dec_rounds2(i): six 2-way rounds for decryption; subkeys i + 7 .. i + 2
	* are used in descending order.
	*/
	#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

	/*
	* dec_fls2(i): 2-way fls layer for decryption; the two subkeys are swapped
	* relative to enc_fls2 (RAB uses i + 1, RCD uses i).
	*/
	#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

	/*
	* dec_inpack2(max): load two consecutive 16-byte blocks from RIO for
	* decryption.  Same unpacking as enc_inpack2, except that RAB0 and RAB1
	* are XORed with the 8-byte key_table entry at index max (a register).
	*/
	#define dec_inpack2(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX, max, 8), RAB1;

	/*
	* dec_outunpack2(): finish decryption of two blocks and store the 32 bytes
	* of output at RIO.  For each block, RCD is XORed with the first 8-byte
	* key_table entry; both halves are rotated and byte-swapped back, and RCD
	* is stored before RAB.
	*/
	#define dec_outunpack2() \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	movq RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	movq RAB1, 12*2(RIO);

	/*
	* __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
	*
	* Same schedule as __camellia_enc_blk (18 rounds, plus an fls layer and
	* six more rounds unless key_length is 16), with both blocks processed in
	* every macro.  The 32 bytes of output are stored to dst or XORed into
	* it, depending on the xor argument.
	*
	* %rbx (RAB1) is saved on the stack and %r12 (RT1) in RR12; both are
	* restored before each ret.
	*/
	ENTRY(__camellia_enc_blk_2way)
	/* input:
	* %rdi: ctx, CTX
	* %rsi: dst
	* %rdx: src
	* %rcx: bool xor
	*/
	pushq %rbx;

	/* %rcx becomes RCD0 and %rsi becomes RIO/RT0: copy the arguments out */
	movq %r12, RR12;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* set only now: RT2 is clobbered by the round macros above */
	movl $24, RT2d; /* max */

	/* compare the whole 32-bit field, as the decrypt routines do */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

	.L__enc2_done:
	/* explicit byte suffix, matching the 1-way routine */
	testb RXORbl, RXORbl;
	/* movq leaves the flags from testb intact for the jnz below */
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;

	.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;
	ENDPROC(__camellia_enc_blk_2way)

	/*
	* camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks from src
	* into dst.
	*
	* max (RT2) is 24 when key_length is 16 and 32 otherwise; it is used as
	* in camellia_dec_blk.  %rbx (RAB1) is kept in RXOR and %r12 (RT1) in
	* RR12 for the duration of the call; no stack is used.
	*/
	ENTRY(camellia_dec_blk_2way)
	/* input:
	* %rdi: ctx, CTX
	* %rsi: dst
	* %rdx: src
	*/
	/* movl does not touch flags, so cmovel still sees the cmpl result */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* RXOR is free again after the cmov; reuse it to hold %rbx */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* last use of max: RT2 is clobbered by the round macros below */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

	.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	ret;
	ENDPROC(camellia_dec_blk_2way)