Diffstat (limited to 'arch')
22 files changed, 3160 insertions, 620 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5bacb4a226a..e0ca7c9ac38 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+			       camellia_aesni_avx_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 00000000000..2306d2e4816
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1102 @@
+/*
+ * x86_64/AVX/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * + */ + +/* + * Version licensed under 2-clause BSD License is available at: + *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz + */ + +#define CAMELLIA_TABLE_BYTE_LEN 272 + +/* struct camellia_ctx: */ +#define key_table 0 +#define key_length CAMELLIA_TABLE_BYTE_LEN + +/* register macros */ +#define CTX %rdi + +/********************************************************************** +  16-way camellia + **********************************************************************/ +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ +	vpand x, mask4bit, tmp0; \ +	vpandn x, mask4bit, x; \ +	vpsrld $4, x, x; \ +	\ +	vpshufb tmp0, lo_t, tmp0; \ +	vpshufb x, hi_t, x; \ +	vpxor tmp0, x, x; + +/* + * IN: + *   x0..x7: byte-sliced AB state + *   mem_cd: register pointer storing CD state + *   key: index for key material + * OUT: + *   x0..x7: new byte-sliced CD state + */ +#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ +		  t7, mem_cd, key) \ +	/* \ +	 * S-function with AES subbytes \ +	 */ \ +	vmovdqa .Linv_shift_row, t4; \ +	vbroadcastss .L0f0f0f0f, t7; \ +	vmovdqa .Lpre_tf_lo_s1, t0; \ +	vmovdqa .Lpre_tf_hi_s1, t1; \ +	\ +	/* AES inverse shift rows */ \ +	vpshufb t4, x0, x0; \ +	vpshufb t4, x7, x7; \ +	vpshufb t4, x1, x1; \ +	vpshufb t4, x4, x4; \ +	vpshufb t4, x2, x2; \ +	vpshufb t4, x5, x5; \ +	vpshufb t4, x3, x3; \ +	vpshufb t4, x6, x6; \ +	\ +	/* prefilter sboxes 1, 2 and 3 */ \ +	vmovdqa .Lpre_tf_lo_s4, t2; \ +	vmovdqa .Lpre_tf_hi_s4, t3; \ +	filter_8bit(x0, t0, t1, t7, t6); \ +	filter_8bit(x7, t0, t1, t7, t6); \ +	filter_8bit(x1, t0, t1, t7, t6); \ +	filter_8bit(x4, t0, t1, t7, t6); \ +	filter_8bit(x2, t0, t1, t7, t6); \ +	filter_8bit(x5, t0, t1, t7, t6); \ +	\ +	/* prefilter sbox 4 */ \ +	vpxor t4, t4, t4; \ +	filter_8bit(x3, t2, t3, t7, t6); \ +	filter_8bit(x6, t2, t3, t7, t6); \ +	\ +	/* AES subbytes + AES shift rows */ \ +	vmovdqa .Lpost_tf_lo_s1, t0; \ +	vmovdqa .Lpost_tf_hi_s1, t1; \ +	vaesenclast t4, x0, x0; \ +	vaesenclast t4, x7, x7; \ +	vaesenclast t4, x1, x1; \ +	vaesenclast t4, x4, x4; \ +	vaesenclast t4, x2, x2; \ +	vaesenclast t4, x5, x5; \ +	vaesenclast t4, x3, x3; \ +	vaesenclast t4, x6, x6; \ +	\ +	/* postfilter sboxes 1 and 4 */ \ +	vmovdqa .Lpost_tf_lo_s3, t2; \ +	vmovdqa .Lpost_tf_hi_s3, t3; \ +	filter_8bit(x0, t0, t1, t7, t6); \ +	filter_8bit(x7, t0, t1, t7, t6); \ +	filter_8bit(x3, t0, t1, t7, t6); \ +	filter_8bit(x6, t0, t1, t7, t6); \ +	\ +	/* postfilter sbox 3 */ \ +	vmovdqa .Lpost_tf_lo_s2, t4; \ +	vmovdqa .Lpost_tf_hi_s2, t5; \ +	filter_8bit(x2, t2, t3, t7, t6); \ +	filter_8bit(x5, t2, t3, t7, t6); \ +	\ +	vpxor t6, t6, t6; \ +	vmovq key, t0; \ +	\ +	/* postfilter sbox 2 */ \ +	filter_8bit(x1, t4, t5, t7, t2); \ +	filter_8bit(x4, t4, t5, t7, t2); \ +	\ +	vpsrldq $5, t0, t5; \ +	vpsrldq $1, t0, t1; \ +	vpsrldq $2, t0, t2; \ +	vpsrldq $3, t0, t3; \ +	vpsrldq $4, t0, t4; \ +	vpshufb t6, t0, t0; \ +	vpshufb t6, t1, t1; \ +	vpshufb t6, t2, t2; \ +	vpshufb t6, t3, t3; \ +	vpshufb t6, t4, t4; \ +	vpsrldq $2, t5, t7; \ +	vpshufb t6, t7, t7; \ +	\ +	/* \ +	 * P-function \ +	 */ \ +	vpxor x5, x0, x0; \ +	vpxor x6, x1, x1; \ +	vpxor x7, x2, x2; \ +	vpxor x4, x3, x3; \ +	\ +	vpxor x2, x4, x4; \ +	vpxor x3, x5, x5; \ +	vpxor x0, x6, x6; \ +	vpxor x1, x7, x7; \ +	\ +	vpxor x7, x0, x0; \ +	vpxor x4, x1, x1; \ +	vpxor x5, x2, x2; \ +	vpxor x6, x3, x3; \ +	\ +	vpxor x3, x4, x4; \ +	vpxor x0, x5, x5; \ +	vpxor x1, x6, x6; \ +	vpxor x2, x7, x7; /* note: high and low parts swapped */ \ +	\ +	/* \ +	 * Add key material and result to CD 
(x becomes new CD) \ +	 */ \ +	\ +	vpxor t3, x4, x4; \ +	vpxor 0 * 16(mem_cd), x4, x4; \ +	\ +	vpxor t2, x5, x5; \ +	vpxor 1 * 16(mem_cd), x5, x5; \ +	\ +	vpsrldq $1, t5, t3; \ +	vpshufb t6, t5, t5; \ +	vpshufb t6, t3, t6; \ +	\ +	vpxor t1, x6, x6; \ +	vpxor 2 * 16(mem_cd), x6, x6; \ +	\ +	vpxor t0, x7, x7; \ +	vpxor 3 * 16(mem_cd), x7, x7; \ +	\ +	vpxor t7, x0, x0; \ +	vpxor 4 * 16(mem_cd), x0, x0; \ +	\ +	vpxor t6, x1, x1; \ +	vpxor 5 * 16(mem_cd), x1, x1; \ +	\ +	vpxor t5, x2, x2; \ +	vpxor 6 * 16(mem_cd), x2, x2; \ +	\ +	vpxor t4, x3, x3; \ +	vpxor 7 * 16(mem_cd), x3, x3; + +/* + * Size optimization... with inlined roundsm16, binary would be over 5 times + * larger and would only be 0.5% faster (on sandy-bridge). + */ +.align 8 +roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: +	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, +		  %rcx, (%r9)); +	ret; + +.align 8 +roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: +	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, +		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, +		  %rax, (%r9)); +	ret; + +/* + * IN/OUT: + *  x0..x7: byte-sliced AB state preloaded + *  mem_ab: byte-sliced AB state in memory + *  mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ +	leaq (key_table + (i) * 8)(CTX), %r9; \ +	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ +	\ +	vmovdqu x4, 0 * 16(mem_cd); \ +	vmovdqu x5, 1 * 16(mem_cd); \ +	vmovdqu x6, 2 * 16(mem_cd); \ +	vmovdqu x7, 3 * 16(mem_cd); \ +	vmovdqu x0, 4 * 16(mem_cd); \ +	vmovdqu x1, 5 * 16(mem_cd); \ +	vmovdqu x2, 6 * 16(mem_cd); \ +	vmovdqu x3, 7 * 16(mem_cd); \ +	\ +	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ +	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ +	\ +	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ +	/* Store new AB state */ \ +	vmovdqu x0, 0 * 16(mem_ab); \ +	vmovdqu x1, 1 * 16(mem_ab); \ +	vmovdqu x2, 2 * 16(mem_ab); \ +	vmovdqu x3, 3 * 16(mem_ab); \ +	vmovdqu x4, 4 * 16(mem_ab); \ +	vmovdqu x5, 5 * 16(mem_ab); \ +	vmovdqu x6, 6 * 16(mem_ab); \ +	vmovdqu x7, 7 * 16(mem_ab); + +#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, i) \ +	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ +	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ +	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, i) \ +	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ +	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ +	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +/* + * IN: + 
*  v0..3: byte-sliced 32-bit integers + * OUT: + *  v0..3: (IN <<< 1) + */ +#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ +	vpcmpgtb v0, zero, t0; \ +	vpaddb v0, v0, v0; \ +	vpabsb t0, t0; \ +	\ +	vpcmpgtb v1, zero, t1; \ +	vpaddb v1, v1, v1; \ +	vpabsb t1, t1; \ +	\ +	vpcmpgtb v2, zero, t2; \ +	vpaddb v2, v2, v2; \ +	vpabsb t2, t2; \ +	\ +	vpor t0, v1, v1; \ +	\ +	vpcmpgtb v3, zero, t0; \ +	vpaddb v3, v3, v3; \ +	vpabsb t0, t0; \ +	\ +	vpor t1, v2, v2; \ +	vpor t2, v3, v3; \ +	vpor t0, v0, v0; + +/* + * IN: + *   r: byte-sliced AB state in memory + *   l: byte-sliced CD state in memory + * OUT: + *   x0..x7: new byte-sliced CD state + */ +#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ +	      tt1, tt2, tt3, kll, klr, krl, krr) \ +	/* \ +	 * t0 = kll; \ +	 * t0 &= ll; \ +	 * lr ^= rol32(t0, 1); \ +	 */ \ +	vpxor tt0, tt0, tt0; \ +	vmovd kll, t0; \ +	vpshufb tt0, t0, t3; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t2; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t1; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t0; \ +	\ +	vpand l0, t0, t0; \ +	vpand l1, t1, t1; \ +	vpand l2, t2, t2; \ +	vpand l3, t3, t3; \ +	\ +	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ +	\ +	vpxor l4, t0, l4; \ +	vmovdqu l4, 4 * 16(l); \ +	vpxor l5, t1, l5; \ +	vmovdqu l5, 5 * 16(l); \ +	vpxor l6, t2, l6; \ +	vmovdqu l6, 6 * 16(l); \ +	vpxor l7, t3, l7; \ +	vmovdqu l7, 7 * 16(l); \ +	\ +	/* \ +	 * t2 = krr; \ +	 * t2 |= rr; \ +	 * rl ^= t2; \ +	 */ \ +	\ +	vmovd krr, t0; \ +	vpshufb tt0, t0, t3; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t2; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t1; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t0; \ +	\ +	vpor 4 * 16(r), t0, t0; \ +	vpor 5 * 16(r), t1, t1; \ +	vpor 6 * 16(r), t2, t2; \ +	vpor 7 * 16(r), t3, t3; \ +	\ +	vpxor 0 * 16(r), t0, t0; \ +	vpxor 1 * 16(r), t1, t1; \ +	vpxor 2 * 16(r), t2, t2; \ +	vpxor 3 * 16(r), t3, t3; \ +	vmovdqu t0, 0 * 16(r); \ +	vmovdqu t1, 1 * 16(r); \ +	vmovdqu t2, 2 * 16(r); \ +	vmovdqu t3, 3 * 16(r); \ +	\ +	/* \ +	 * t2 = krl; \ +	 * t2 &= rl; \ +	 * rr ^= rol32(t2, 1); \ +	 */ \ +	vmovd krl, t0; \ +	vpshufb tt0, t0, t3; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t2; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t1; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t0; \ +	\ +	vpand 0 * 16(r), t0, t0; \ +	vpand 1 * 16(r), t1, t1; \ +	vpand 2 * 16(r), t2, t2; \ +	vpand 3 * 16(r), t3, t3; \ +	\ +	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ +	\ +	vpxor 4 * 16(r), t0, t0; \ +	vpxor 5 * 16(r), t1, t1; \ +	vpxor 6 * 16(r), t2, t2; \ +	vpxor 7 * 16(r), t3, t3; \ +	vmovdqu t0, 4 * 16(r); \ +	vmovdqu t1, 5 * 16(r); \ +	vmovdqu t2, 6 * 16(r); \ +	vmovdqu t3, 7 * 16(r); \ +	\ +	/* \ +	 * t0 = klr; \ +	 * t0 |= lr; \ +	 * ll ^= t0; \ +	 */ \ +	\ +	vmovd klr, t0; \ +	vpshufb tt0, t0, t3; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t2; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t1; \ +	vpsrldq $1, t0, t0; \ +	vpshufb tt0, t0, t0; \ +	\ +	vpor l4, t0, t0; \ +	vpor l5, t1, t1; \ +	vpor l6, t2, t2; \ +	vpor l7, t3, t3; \ +	\ +	vpxor l0, t0, l0; \ +	vmovdqu l0, 0 * 16(l); \ +	vpxor l1, t1, l1; \ +	vmovdqu l1, 1 * 16(l); \ +	vpxor l2, t2, l2; \ +	vmovdqu l2, 2 * 16(l); \ +	vpxor l3, t3, l3; \ +	vmovdqu l3, 3 * 16(l); + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ +	vpunpckhdq x1, x0, t2; \ +	vpunpckldq x1, x0, x0; \ +	\ +	vpunpckldq x3, x2, t1; \ +	vpunpckhdq x3, x2, x2; \ +	\ +	vpunpckhqdq t1, x0, x1; \ +	vpunpcklqdq t1, x0, x0; \ +	\ +	vpunpckhqdq x2, t2, x3; \ +	vpunpcklqdq x2, t2, x2; + +#define 
byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \ +			 b3, c3, d3, st0, st1) \ +	vmovdqu d2, st0; \ +	vmovdqu d3, st1; \ +	transpose_4x4(a0, a1, a2, a3, d2, d3); \ +	transpose_4x4(b0, b1, b2, b3, d2, d3); \ +	vmovdqu st0, d2; \ +	vmovdqu st1, d3; \ +	\ +	vmovdqu a0, st0; \ +	vmovdqu a1, st1; \ +	transpose_4x4(c0, c1, c2, c3, a0, a1); \ +	transpose_4x4(d0, d1, d2, d3, a0, a1); \ +	\ +	vmovdqu .Lshufb_16x16b, a0; \ +	vmovdqu st1, a1; \ +	vpshufb a0, a2, a2; \ +	vpshufb a0, a3, a3; \ +	vpshufb a0, b0, b0; \ +	vpshufb a0, b1, b1; \ +	vpshufb a0, b2, b2; \ +	vpshufb a0, b3, b3; \ +	vpshufb a0, a1, a1; \ +	vpshufb a0, c0, c0; \ +	vpshufb a0, c1, c1; \ +	vpshufb a0, c2, c2; \ +	vpshufb a0, c3, c3; \ +	vpshufb a0, d0, d0; \ +	vpshufb a0, d1, d1; \ +	vpshufb a0, d2, d2; \ +	vpshufb a0, d3, d3; \ +	vmovdqu d3, st1; \ +	vmovdqu st0, d3; \ +	vpshufb a0, d3, a0; \ +	vmovdqu d2, st0; \ +	\ +	transpose_4x4(a0, b0, c0, d0, d2, d3); \ +	transpose_4x4(a1, b1, c1, d1, d2, d3); \ +	vmovdqu st0, d2; \ +	vmovdqu st1, d3; \ +	\ +	vmovdqu b0, st0; \ +	vmovdqu b1, st1; \ +	transpose_4x4(a2, b2, c2, d2, b0, b1); \ +	transpose_4x4(a3, b3, c3, d3, b0, b1); \ +	vmovdqu st0, b0; \ +	vmovdqu st1, b1; \ +	/* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		     y6, y7, rio, key) \ +	vmovq key, x0; \ +	vpshufb .Lpack_bswap, x0, x0; \ +	\ +	vpxor 0 * 16(rio), x0, y7; \ +	vpxor 1 * 16(rio), x0, y6; \ +	vpxor 2 * 16(rio), x0, y5; \ +	vpxor 3 * 16(rio), x0, y4; \ +	vpxor 4 * 16(rio), x0, y3; \ +	vpxor 5 * 16(rio), x0, y2; \ +	vpxor 6 * 16(rio), x0, y1; \ +	vpxor 7 * 16(rio), x0, y0; \ +	vpxor 8 * 16(rio), x0, x7; \ +	vpxor 9 * 16(rio), x0, x6; \ +	vpxor 10 * 16(rio), x0, x5; \ +	vpxor 11 * 16(rio), x0, x4; \ +	vpxor 12 * 16(rio), x0, x3; \ +	vpxor 13 * 16(rio), x0, x2; \ +	vpxor 14 * 16(rio), x0, x1; \ +	vpxor 15 * 16(rio), x0, x0; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		      y6, y7, mem_ab, mem_cd) \ +	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ +			 y5, y6, y7, (mem_ab), (mem_cd)); \ +	\ +	vmovdqu x0, 0 * 16(mem_ab); \ +	vmovdqu x1, 1 * 16(mem_ab); \ +	vmovdqu x2, 2 * 16(mem_ab); \ +	vmovdqu x3, 3 * 16(mem_ab); \ +	vmovdqu x4, 4 * 16(mem_ab); \ +	vmovdqu x5, 5 * 16(mem_ab); \ +	vmovdqu x6, 6 * 16(mem_ab); \ +	vmovdqu x7, 7 * 16(mem_ab); \ +	vmovdqu y0, 0 * 16(mem_cd); \ +	vmovdqu y1, 1 * 16(mem_cd); \ +	vmovdqu y2, 2 * 16(mem_cd); \ +	vmovdqu y3, 3 * 16(mem_cd); \ +	vmovdqu y4, 4 * 16(mem_cd); \ +	vmovdqu y5, 5 * 16(mem_cd); \ +	vmovdqu y6, 6 * 16(mem_cd); \ +	vmovdqu y7, 7 * 16(mem_cd); + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ +		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \ +	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \ +			 y7, x3, x7, stack_tmp0, stack_tmp1); \ +	\ +	vmovdqu x0, stack_tmp0; \ +	\ +	vmovq key, x0; \ +	vpshufb .Lpack_bswap, x0, x0; \ +	\ +	vpxor x0, y7, y7; \ +	vpxor x0, y6, y6; \ +	vpxor x0, y5, y5; \ +	vpxor x0, y4, y4; \ +	vpxor x0, y3, y3; \ +	vpxor x0, y2, y2; \ +	vpxor x0, y1, y1; \ +	vpxor x0, y0, y0; \ +	vpxor x0, x7, x7; \ +	vpxor x0, x6, x6; \ +	vpxor x0, x5, x5; \ +	vpxor x0, x4, x4; \ +	vpxor x0, x3, x3; \ +	vpxor x0, x2, x2; \ +	vpxor x0, x1, x1; \ +	vpxor stack_tmp0, x0, x0; + 
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ +		     y6, y7, rio) \ +	vmovdqu x0, 0 * 16(rio); \ +	vmovdqu x1, 1 * 16(rio); \ +	vmovdqu x2, 2 * 16(rio); \ +	vmovdqu x3, 3 * 16(rio); \ +	vmovdqu x4, 4 * 16(rio); \ +	vmovdqu x5, 5 * 16(rio); \ +	vmovdqu x6, 6 * 16(rio); \ +	vmovdqu x7, 7 * 16(rio); \ +	vmovdqu y0, 8 * 16(rio); \ +	vmovdqu y1, 9 * 16(rio); \ +	vmovdqu y2, 10 * 16(rio); \ +	vmovdqu y3, 11 * 16(rio); \ +	vmovdqu y4, 12 * 16(rio); \ +	vmovdqu y5, 13 * 16(rio); \ +	vmovdqu y6, 14 * 16(rio); \ +	vmovdqu y7, 15 * 16(rio); + +.data +.align 16 + +#define SHUFB_BYTES(idx) \ +	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) + +.Lshufb_16x16b: +	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); + +.Lpack_bswap: +	.long 0x00010203 +	.long 0x04050607 +	.long 0x80808080 +	.long 0x80808080 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: +	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox1, sbox2, sbox3: + *   swap_bitendianness( + *       isom_map_camellia_to_aes( + *           camellia_f( + *               swap_bitendianess(in) + *           ) + *       ) + *   ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s1: +	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 +	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 +.Lpre_tf_hi_s1: +	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a +	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox4: + *   swap_bitendianness( + *       isom_map_camellia_to_aes( + *           camellia_f( + *               swap_bitendianess(in <<< 1) + *           ) + *       ) + *   ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s4: +	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 +	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 +.Lpre_tf_hi_s4: +	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 +	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf + +/* + * post-SubByte transform + * + * post-lookup for sbox1, sbox4: + *  swap_bitendianness( + *      camellia_h( + *          isom_map_aes_to_camellia( + *              swap_bitendianness( + *                  aes_inverse_affine_transform(in) + *              ) + *          ) + *      ) + *  ) + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s1: +	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 +	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 +.Lpost_tf_hi_s1: +	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 +	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c + +/* + * post-SubByte transform + * + * post-lookup for sbox2: + *  swap_bitendianness( + *      camellia_h( + *          isom_map_aes_to_camellia( + *              swap_bitendianness( + *                  aes_inverse_affine_transform(in) + *              ) + *          ) + *      ) + *  ) <<< 1 + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s2: +	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 +	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 +.Lpost_tf_hi_s2: +	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 +	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 + +/* + * post-SubByte transform + * + * post-lookup for sbox3: + *  swap_bitendianness( + *      camellia_h( + *          isom_map_aes_to_camellia( + *              swap_bitendianness( + *                  aes_inverse_affine_transform(in) + *              ) + *          ) + *      ) + *  ) >>> 1 + * 
+ * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s3: +	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 +	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 +.Lpost_tf_hi_s3: +	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 +	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: +	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b +	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* 4-bit mask */ +.align 4 +.L0f0f0f0f: +	.long 0x0f0f0f0f + +.text + +.align 8 +.type   __camellia_enc_blk16,@function; + +__camellia_enc_blk16: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rax: temporary storage, 256 bytes +	 *	%xmm0..%xmm15: 16 plaintext blocks +	 * output: +	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped: +	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 +	 */ + +	leaq 8 * 16(%rax), %rcx; + +	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		      %xmm15, %rax, %rcx); + +	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 0); + +	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +	      %xmm15, +	      ((key_table + (8) * 8) + 0)(CTX), +	      ((key_table + (8) * 8) + 4)(CTX), +	      ((key_table + (8) * 8) + 8)(CTX), +	      ((key_table + (8) * 8) + 12)(CTX)); + +	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 8); + +	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +	      %xmm15, +	      ((key_table + (16) * 8) + 0)(CTX), +	      ((key_table + (16) * 8) + 4)(CTX), +	      ((key_table + (16) * 8) + 8)(CTX), +	      ((key_table + (16) * 8) + 12)(CTX)); + +	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 16); + +	movl $24, %r8d; +	cmpl $16, key_length(CTX); +	jne .Lenc_max32; + +.Lenc_done: +	/* load CD for output */ +	vmovdqu 0 * 16(%rcx), %xmm8; +	vmovdqu 1 * 16(%rcx), %xmm9; +	vmovdqu 2 * 16(%rcx), %xmm10; +	vmovdqu 3 * 16(%rcx), %xmm11; +	vmovdqu 4 * 16(%rcx), %xmm12; +	vmovdqu 5 * 16(%rcx), %xmm13; +	vmovdqu 6 * 16(%rcx), %xmm14; +	vmovdqu 7 * 16(%rcx), %xmm15; + +	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); + +	ret; + +.align 8 +.Lenc_max32: +	movl $32, %r8d; + +	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +	      %xmm15, +	      ((key_table + (24) * 8) + 0)(CTX), +	      ((key_table + (24) * 8) + 4)(CTX), +	      ((key_table + (24) * 8) + 8)(CTX), +	      ((key_table + (24) * 8) + 12)(CTX)); + +	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 24); + +	jmp .Lenc_done; + +.align 8 +.type   __camellia_dec_blk16,@function; + +__camellia_dec_blk16: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rax: temporary storage, 256 bytes +	 *	%r8d: 24 for 
16 byte key, 32 for larger +	 *	%xmm0..%xmm15: 16 encrypted blocks +	 * output: +	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped: +	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 +	 */ + +	leaq 8 * 16(%rax), %rcx; + +	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		      %xmm15, %rax, %rcx); + +	cmpl $32, %r8d; +	je .Ldec_max32; + +.Ldec_max24: +	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 16); + +	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +	      %xmm15, +	      ((key_table + (16) * 8) + 8)(CTX), +	      ((key_table + (16) * 8) + 12)(CTX), +	      ((key_table + (16) * 8) + 0)(CTX), +	      ((key_table + (16) * 8) + 4)(CTX)); + +	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 8); + +	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +	      %xmm15, +	      ((key_table + (8) * 8) + 8)(CTX), +	      ((key_table + (8) * 8) + 12)(CTX), +	      ((key_table + (8) * 8) + 0)(CTX), +	      ((key_table + (8) * 8) + 4)(CTX)); + +	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 0); + +	/* load CD for output */ +	vmovdqu 0 * 16(%rcx), %xmm8; +	vmovdqu 1 * 16(%rcx), %xmm9; +	vmovdqu 2 * 16(%rcx), %xmm10; +	vmovdqu 3 * 16(%rcx), %xmm11; +	vmovdqu 4 * 16(%rcx), %xmm12; +	vmovdqu 5 * 16(%rcx), %xmm13; +	vmovdqu 6 * 16(%rcx), %xmm14; +	vmovdqu 7 * 16(%rcx), %xmm15; + +	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); + +	ret; + +.align 8 +.Ldec_max32: +	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rax, %rcx, 24); + +	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +	      %xmm15, +	      ((key_table + (24) * 8) + 8)(CTX), +	      ((key_table + (24) * 8) + 12)(CTX), +	      ((key_table + (24) * 8) + 0)(CTX), +	      ((key_table + (24) * 8) + 4)(CTX)); + +	jmp .Ldec_max24; + +.align 8 +.global camellia_ecb_enc_16way +.type   camellia_ecb_enc_16way,@function; + +camellia_ecb_enc_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst (16 blocks) +	 *	%rdx: src (16 blocks) +	 */ + +	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rdx, (key_table)(CTX)); + +	/* now dst can be used as temporary buffer (even in src == dst case) */ +	movq	%rsi, %rax; + +	call __camellia_enc_blk16; + +	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, +		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, +		     %xmm8, %rsi); + +	ret; + +.align 8 +.global camellia_ecb_dec_16way +.type   camellia_ecb_dec_16way,@function; + +camellia_ecb_dec_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst (16 blocks) +	 *	%rdx: src (16 blocks) +	 */ + +	cmpl $16, key_length(CTX); +	
movl $32, %r8d; +	movl $24, %eax; +	cmovel %eax, %r8d; /* max */ + +	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rdx, (key_table)(CTX, %r8, 8)); + +	/* now dst can be used as temporary buffer (even in src == dst case) */ +	movq	%rsi, %rax; + +	call __camellia_dec_blk16; + +	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, +		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, +		     %xmm8, %rsi); + +	ret; + +.align 8 +.global camellia_cbc_dec_16way +.type   camellia_cbc_dec_16way,@function; + +camellia_cbc_dec_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst (16 blocks) +	 *	%rdx: src (16 blocks) +	 */ + +	cmpl $16, key_length(CTX); +	movl $32, %r8d; +	movl $24, %eax; +	cmovel %eax, %r8d; /* max */ + +	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, +		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, +		     %xmm15, %rdx, (key_table)(CTX, %r8, 8)); + +	/* +	 * dst might still be in-use (in case dst == src), so use stack for +	 * temporary storage. +	 */ +	subq $(16 * 16), %rsp; +	movq %rsp, %rax; + +	call __camellia_dec_blk16; + +	addq $(16 * 16), %rsp; + +	vpxor (0 * 16)(%rdx), %xmm6, %xmm6; +	vpxor (1 * 16)(%rdx), %xmm5, %xmm5; +	vpxor (2 * 16)(%rdx), %xmm4, %xmm4; +	vpxor (3 * 16)(%rdx), %xmm3, %xmm3; +	vpxor (4 * 16)(%rdx), %xmm2, %xmm2; +	vpxor (5 * 16)(%rdx), %xmm1, %xmm1; +	vpxor (6 * 16)(%rdx), %xmm0, %xmm0; +	vpxor (7 * 16)(%rdx), %xmm15, %xmm15; +	vpxor (8 * 16)(%rdx), %xmm14, %xmm14; +	vpxor (9 * 16)(%rdx), %xmm13, %xmm13; +	vpxor (10 * 16)(%rdx), %xmm12, %xmm12; +	vpxor (11 * 16)(%rdx), %xmm11, %xmm11; +	vpxor (12 * 16)(%rdx), %xmm10, %xmm10; +	vpxor (13 * 16)(%rdx), %xmm9, %xmm9; +	vpxor (14 * 16)(%rdx), %xmm8, %xmm8; +	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, +		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, +		     %xmm8, %rsi); + +	ret; + +#define inc_le128(x, minus_one, tmp) \ +	vpcmpeqq minus_one, x, tmp; \ +	vpsubq minus_one, x, x; \ +	vpslldq $8, tmp, tmp; \ +	vpsubq tmp, x, x; + +.align 8 +.global camellia_ctr_16way +.type   camellia_ctr_16way,@function; + +camellia_ctr_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst (16 blocks) +	 *	%rdx: src (16 blocks) +	 *	%rcx: iv (little endian, 128bit) +	 */ + +	subq $(16 * 16), %rsp; +	movq %rsp, %rax; + +	vmovdqa .Lbswap128_mask, %xmm14; + +	/* load IV and byteswap */ +	vmovdqu (%rcx), %xmm0; +	vpshufb %xmm14, %xmm0, %xmm15; +	vmovdqu %xmm15, 15 * 16(%rax); + +	vpcmpeqd %xmm15, %xmm15, %xmm15; +	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ + +	/* construct IVs */ +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm13; +	vmovdqu %xmm13, 14 * 16(%rax); +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm13; +	vmovdqu %xmm13, 13 * 16(%rax); +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm12; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm11; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm10; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm9; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm8; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm7; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm6; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm5; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm4; +	inc_le128(%xmm0, %xmm15, %xmm13); +	
vpshufb %xmm14, %xmm0, %xmm3; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm2; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vpshufb %xmm14, %xmm0, %xmm1; +	inc_le128(%xmm0, %xmm15, %xmm13); +	vmovdqa %xmm0, %xmm13; +	vpshufb %xmm14, %xmm0, %xmm0; +	inc_le128(%xmm13, %xmm15, %xmm14); +	vmovdqu %xmm13, (%rcx); + +	/* inpack16_pre: */ +	vmovq (key_table)(CTX), %xmm15; +	vpshufb .Lpack_bswap, %xmm15, %xmm15; +	vpxor %xmm0, %xmm15, %xmm0; +	vpxor %xmm1, %xmm15, %xmm1; +	vpxor %xmm2, %xmm15, %xmm2; +	vpxor %xmm3, %xmm15, %xmm3; +	vpxor %xmm4, %xmm15, %xmm4; +	vpxor %xmm5, %xmm15, %xmm5; +	vpxor %xmm6, %xmm15, %xmm6; +	vpxor %xmm7, %xmm15, %xmm7; +	vpxor %xmm8, %xmm15, %xmm8; +	vpxor %xmm9, %xmm15, %xmm9; +	vpxor %xmm10, %xmm15, %xmm10; +	vpxor %xmm11, %xmm15, %xmm11; +	vpxor %xmm12, %xmm15, %xmm12; +	vpxor 13 * 16(%rax), %xmm15, %xmm13; +	vpxor 14 * 16(%rax), %xmm15, %xmm14; +	vpxor 15 * 16(%rax), %xmm15, %xmm15; + +	call __camellia_enc_blk16; + +	addq $(16 * 16), %rsp; + +	vpxor 0 * 16(%rdx), %xmm7, %xmm7; +	vpxor 1 * 16(%rdx), %xmm6, %xmm6; +	vpxor 2 * 16(%rdx), %xmm5, %xmm5; +	vpxor 3 * 16(%rdx), %xmm4, %xmm4; +	vpxor 4 * 16(%rdx), %xmm3, %xmm3; +	vpxor 5 * 16(%rdx), %xmm2, %xmm2; +	vpxor 6 * 16(%rdx), %xmm1, %xmm1; +	vpxor 7 * 16(%rdx), %xmm0, %xmm0; +	vpxor 8 * 16(%rdx), %xmm15, %xmm15; +	vpxor 9 * 16(%rdx), %xmm14, %xmm14; +	vpxor 10 * 16(%rdx), %xmm13, %xmm13; +	vpxor 11 * 16(%rdx), %xmm12, %xmm12; +	vpxor 12 * 16(%rdx), %xmm11, %xmm11; +	vpxor 13 * 16(%rdx), %xmm10, %xmm10; +	vpxor 14 * 16(%rdx), %xmm9, %xmm9; +	vpxor 15 * 16(%rdx), %xmm8, %xmm8; +	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, +		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, +		     %xmm8, %rsi); + +	ret; diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c new file mode 100644 index 00000000000..96cbb6068fc --- /dev/null +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -0,0 +1,558 @@ +/* + * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia + * + * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/algapi.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/camellia.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> + +#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 + +/* 16-way AES-NI parallel cipher functions */ +asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, +				       const u8 *src); +asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, +				       const u8 *src); + +asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, +				       const u8 *src); +asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, +				   const u8 *src, le128 *iv); + +static const struct common_glue_ctx camellia_enc = { +	.num_funcs = 3, +	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } +	}, { +		.num_blocks = 2, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } +	} } +}; + +static const struct common_glue_ctx camellia_ctr = { +	.num_funcs = 3, +	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } +	}, { +		.num_blocks = 2, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } +	} } +}; + +static const struct common_glue_ctx camellia_dec = { +	.num_funcs = 3, +	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } +	}, { +		.num_blocks = 2, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } +	} } +}; + +static const struct common_glue_ctx camellia_dec_cbc = { +	.num_funcs = 3, +	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } +	}, { +		.num_blocks = 2, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } +	} } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, +				       dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, +				       nbytes); +} 
+ +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		     struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); +} + +static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ +	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, +			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, +			      nbytes); +} + +static inline void camellia_fpu_end(bool fpu_enabled) +{ +	glue_fpu_end(fpu_enabled); +} + +static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, +			   unsigned int key_len) +{ +	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, +				 &tfm->crt_flags); +} + +struct crypt_priv { +	struct camellia_ctx *ctx; +	bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ +	const unsigned int bsize = CAMELLIA_BLOCK_SIZE; +	struct crypt_priv *ctx = priv; +	int i; + +	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); + +	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { +		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); +		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; +		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; +	} + +	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { +		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); +		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; +		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; +	} + +	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) +		camellia_enc_blk(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ +	const unsigned int bsize = CAMELLIA_BLOCK_SIZE; +	struct crypt_priv *ctx = priv; +	int i; + +	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); + +	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { +		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); +		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; +		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; +	} + +	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { +		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); +		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; +		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; +	} + +	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) +		camellia_dec_blk(ctx->ctx, srcdst, srcdst); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->camellia_ctx, +		.fpu_enabled = false, +	}; +	struct lrw_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.table_ctx = &ctx->lrw_table, +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = encrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = lrw_crypt(desc, dst, src, nbytes, &req); +	camellia_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->camellia_ctx, +		.fpu_enabled = false, +	}; +	struct lrw_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.table_ctx = &ctx->lrw_table, +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = decrypt_callback, +	}; +	int 
ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = lrw_crypt(desc, dst, src, nbytes, &req); +	camellia_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->crypt_ctx, +		.fpu_enabled = false, +	}; +	struct xts_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.tweak_ctx = &ctx->tweak_ctx, +		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = encrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = xts_crypt(desc, dst, src, nbytes, &req); +	camellia_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->crypt_ctx, +		.fpu_enabled = false, +	}; +	struct xts_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.tweak_ctx = &ctx->tweak_ctx, +		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = decrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = xts_crypt(desc, dst, src, nbytes, &req); +	camellia_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static struct crypto_alg cmll_algs[10] = { { +	.cra_name		= "__ecb-camellia-aesni", +	.cra_driver_name	= "__driver-ecb-camellia-aesni", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct camellia_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_u = { +		.blkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE, +			.setkey		= camellia_setkey, +			.encrypt	= ecb_encrypt, +			.decrypt	= ecb_decrypt, +		}, +	}, +}, { +	.cra_name		= "__cbc-camellia-aesni", +	.cra_driver_name	= "__driver-cbc-camellia-aesni", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct camellia_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_u = { +		.blkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE, +			.setkey		= camellia_setkey, +			.encrypt	= cbc_encrypt, +			.decrypt	= cbc_decrypt, +		}, +	}, +}, { +	.cra_name		= "__ctr-camellia-aesni", +	.cra_driver_name	= "__driver-ctr-camellia-aesni", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct camellia_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_u = { +		.blkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE, +			.ivsize		= CAMELLIA_BLOCK_SIZE, +			.setkey		= camellia_setkey, +			.encrypt	= ctr_crypt, +			.decrypt	= ctr_crypt, +		}, +	}, +}, { +	.cra_name		= "__lrw-camellia-aesni", +	.cra_driver_name	= "__driver-lrw-camellia-aesni", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, 
+	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_exit		= lrw_camellia_exit_tfm, +	.cra_u = { +		.blkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE + +					  CAMELLIA_BLOCK_SIZE, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE + +					  CAMELLIA_BLOCK_SIZE, +			.ivsize		= CAMELLIA_BLOCK_SIZE, +			.setkey		= lrw_camellia_setkey, +			.encrypt	= lrw_encrypt, +			.decrypt	= lrw_decrypt, +		}, +	}, +}, { +	.cra_name		= "__xts-camellia-aesni", +	.cra_driver_name	= "__driver-xts-camellia-aesni", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct camellia_xts_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_u = { +		.blkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2, +			.ivsize		= CAMELLIA_BLOCK_SIZE, +			.setkey		= xts_camellia_setkey, +			.encrypt	= xts_encrypt, +			.decrypt	= xts_decrypt, +		}, +	}, +}, { +	.cra_name		= "ecb(camellia)", +	.cra_driver_name	= "ecb-camellia-aesni", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "cbc(camellia)", +	.cra_driver_name	= "cbc-camellia-aesni", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE, +			.ivsize		= CAMELLIA_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= __ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "ctr(camellia)", +	.cra_driver_name	= "ctr-camellia-aesni", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE, +			.ivsize		= CAMELLIA_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_encrypt, +			.geniv		= "chainiv", +		}, +	}, +}, { +	.cra_name		= "lrw(camellia)", +	.cra_driver_name	= "lrw-camellia-aesni", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE + +					  CAMELLIA_BLOCK_SIZE, +			
.max_keysize	= CAMELLIA_MAX_KEY_SIZE + +					  CAMELLIA_BLOCK_SIZE, +			.ivsize		= CAMELLIA_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "xts(camellia)", +	.cra_driver_name	= "xts-camellia-aesni", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= CAMELLIA_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2, +			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2, +			.ivsize		= CAMELLIA_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +} }; + +static int __init camellia_aesni_init(void) +{ +	u64 xcr0; + +	if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { +		pr_info("AVX or AES-NI instructions are not detected.\n"); +		return -ENODEV; +	} + +	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); +	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { +		pr_info("AVX detected but unusable.\n"); +		return -ENODEV; +	} + +	return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); +} + +static void __exit camellia_aesni_fini(void) +{ +	crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); +} + +module_init(camellia_aesni_init); +module_exit(camellia_aesni_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized"); +MODULE_ALIAS("camellia"); +MODULE_ALIAS("camellia-asm"); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 42ffd2bbab5..5cb86ccd4ac 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -32,53 +32,24 @@  #include <crypto/algapi.h>  #include <crypto/lrw.h>  #include <crypto/xts.h> +#include <asm/crypto/camellia.h>  #include <asm/crypto/glue_helper.h> -#define CAMELLIA_MIN_KEY_SIZE	16 -#define CAMELLIA_MAX_KEY_SIZE	32 -#define CAMELLIA_BLOCK_SIZE	16 -#define CAMELLIA_TABLE_BYTE_LEN	272 - -struct camellia_ctx { -	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; -	u32 key_length; -}; -  /* regular block cipher functions */  asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,  				   const u8 *src, bool xor); +EXPORT_SYMBOL_GPL(__camellia_enc_blk);  asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,  				 const u8 *src); +EXPORT_SYMBOL_GPL(camellia_dec_blk);  /* 2-way parallel cipher functions */  asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,  					const u8 *src, bool xor); +EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);  asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,  				      const u8 *src); - -static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, -				    const u8 *src) -{ -	__camellia_enc_blk(ctx, dst, src, false); -} - -static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, -					const u8 *src) -{ -	__camellia_enc_blk(ctx, dst, src, true); -} - -static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, -					 const u8 *src) -{ -	__camellia_enc_blk_2way(ctx, dst, src, false); -} - -static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, -					     const u8 *src) -{ -	__camellia_enc_blk_2way(ctx, dst, src, true); -} +EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);  static 
void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)  { @@ -1275,9 +1246,8 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)  	camellia_setup256(kk, subkey);  } -static int __camellia_setkey(struct camellia_ctx *cctx, -			     const unsigned char *key, -			     unsigned int key_len, u32 *flags) +int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, +		      unsigned int key_len, u32 *flags)  {  	if (key_len != 16 && key_len != 24 && key_len != 32) {  		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; @@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx,  	return 0;  } +EXPORT_SYMBOL_GPL(__camellia_setkey);  static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,  			   unsigned int key_len) @@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,  				 &tfm->crt_flags);  } -static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) +void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)  {  	u128 iv = *src; @@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)  	u128_xor(&dst[1], &dst[1], &iv);  } +EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)  {  	be128 ctrblk;  	if (dst != src)  		*dst = *src; -	u128_to_be128(&ctrblk, iv); -	u128_inc(iv); +	le128_to_be128(&ctrblk, iv); +	le128_inc(iv);  	camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);  } +EXPORT_SYMBOL_GPL(camellia_crypt_ctr); -static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, -				    u128 *iv) +void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)  {  	be128 ctrblks[2]; @@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,  		dst[1] = src[1];  	} -	u128_to_be128(&ctrblks[0], iv); -	u128_inc(iv); -	u128_to_be128(&ctrblks[1], iv); -	u128_inc(iv); +	le128_to_be128(&ctrblks[0], iv); +	le128_inc(iv); +	le128_to_be128(&ctrblks[1], iv); +	le128_inc(iv);  	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);  } +EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);  static const struct common_glue_ctx camellia_enc = {  	.num_funcs = 2, @@ -1464,13 +1437,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  		camellia_dec_blk(ctx, srcdst, srcdst);  } -struct camellia_lrw_ctx { -	struct lrw_table_ctx lrw_table; -	struct camellia_ctx camellia_ctx; -}; - -static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, -			      unsigned int keylen) +int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, +			unsigned int keylen)  {  	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);  	int err; @@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,  	return lrw_init_table(&ctx->lrw_table,  			      key + keylen - CAMELLIA_BLOCK_SIZE);  } +EXPORT_SYMBOL_GPL(lrw_camellia_setkey);  static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  		       struct scatterlist *src, unsigned int nbytes) @@ -1519,20 +1488,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  	return lrw_crypt(desc, dst, src, nbytes, &req);  } -static void lrw_exit_tfm(struct crypto_tfm *tfm) +void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)  {  	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);  	
lrw_free_table(&ctx->lrw_table);  } +EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm); -struct camellia_xts_ctx { -	struct camellia_ctx tweak_ctx; -	struct camellia_ctx crypt_ctx; -}; - -static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, -			      unsigned int keylen) +int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, +			unsigned int keylen)  {  	struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);  	u32 *flags = &tfm->crt_flags; @@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,  	return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,  				flags);  } +EXPORT_SYMBOL_GPL(xts_camellia_setkey);  static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  		       struct scatterlist *src, unsigned int nbytes) @@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { {  	.cra_alignmask		= 0,  	.cra_type		= &crypto_blkcipher_type,  	.cra_module		= THIS_MODULE, -	.cra_exit		= lrw_exit_tfm, +	.cra_exit		= lrw_camellia_exit_tfm,  	.cra_u = {  		.blkcipher = {  			.min_keysize	= CAMELLIA_MIN_KEY_SIZE + diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index a41a3aaba22..15b00ac7cbd 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -25,10 +25,10 @@  .file "cast5-avx-x86_64-asm_64.S" -.extern cast5_s1 -.extern cast5_s2 -.extern cast5_s3 -.extern cast5_s4 +.extern cast_s1 +.extern cast_s2 +.extern cast_s3 +.extern cast_s4  /* structure of crypto context */  #define km	0 @@ -36,10 +36,10 @@  #define rr	((16*4)+16)  /* s-boxes */ -#define s1	cast5_s1 -#define s2	cast5_s2 -#define s3	cast5_s3 -#define s4	cast5_s4 +#define s1	cast_s1 +#define s2	cast_s2 +#define s3	cast_s3 +#define s4	cast_s4  /**********************************************************************    16-way AVX cast5 @@ -180,31 +180,17 @@  	vpunpcklqdq		t1, t0, x0; \  	vpunpckhqdq		t1, t0, x1; -#define inpack_blocks(in, x0, x1, t0, t1, rmask) \ -	vmovdqu (0*4*4)(in),	x0; \ -	vmovdqu (1*4*4)(in),	x1; \ +#define inpack_blocks(x0, x1, t0, t1, rmask) \  	vpshufb rmask, 	x0,	x0; \  	vpshufb rmask, 	x1,	x1; \  	\  	transpose_2x4(x0, x1, t0, t1) -#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \ +#define outunpack_blocks(x0, x1, t0, t1, rmask) \  	transpose_2x4(x0, x1, t0, t1) \  	\  	vpshufb rmask,	x0, x0;           \ -	vpshufb rmask,	x1, x1;           \ -	vmovdqu 	x0, (0*4*4)(out); \ -	vmovdqu		x1, (1*4*4)(out); - -#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \ -	transpose_2x4(x0, x1, t0, t1) \ -	\ -	vpshufb rmask,	x0, x0;               \ -	vpshufb rmask,	x1, x1;               \ -	vpxor		(0*4*4)(out), x0, x0; \ -	vmovdqu 	x0, (0*4*4)(out);     \ -	vpxor		(1*4*4)(out), x1, x1; \ -	vmovdqu	        x1, (1*4*4)(out); +	vpshufb rmask,	x1, x1;  .data @@ -213,6 +199,8 @@  	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12  .Lbswap128_mask:  	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lbswap_iv_mask: +	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0  .L16_mask:  	.byte 16, 16, 16, 16  .L32_mask: @@ -223,35 +211,42 @@  .text  .align 16 -.global __cast5_enc_blk_16way -.type   __cast5_enc_blk_16way,@function; +.type   __cast5_enc_blk16,@function; -__cast5_enc_blk_16way: +__cast5_enc_blk16:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src -	 *	%rcx: bool, if true: xor output +	 *	RL1: blocks 1 and 2 +	 *	RR1: blocks 3 and 4 +	 *	RL2: blocks 5 and 6 +	 *	RR2: blocks 7 and 8 +	 *	RL3: 
blocks 9 and 10 +	 *	RR3: blocks 11 and 12 +	 *	RL4: blocks 13 and 14 +	 *	RR4: blocks 15 and 16 +	 * output: +	 *	RL1: encrypted blocks 1 and 2 +	 *	RR1: encrypted blocks 3 and 4 +	 *	RL2: encrypted blocks 5 and 6 +	 *	RR2: encrypted blocks 7 and 8 +	 *	RL3: encrypted blocks 9 and 10 +	 *	RR3: encrypted blocks 11 and 12 +	 *	RL4: encrypted blocks 13 and 14 +	 *	RR4: encrypted blocks 15 and 16  	 */  	pushq %rbp;  	pushq %rbx; -	pushq %rcx;  	vmovdqa .Lbswap_mask, RKM;  	vmovd .Lfirst_mask, R1ST;  	vmovd .L32_mask, R32;  	enc_preload_rkr(); -	leaq 1*(2*4*4)(%rdx), %rax; -	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); -	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); -	leaq 2*(2*4*4)(%rdx), %rax; -	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); -	leaq 3*(2*4*4)(%rdx), %rax; -	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - -	movq %rsi, %r11; +	inpack_blocks(RL1, RR1, RTMP, RX, RKM); +	inpack_blocks(RL2, RR2, RTMP, RX, RKM); +	inpack_blocks(RL3, RR3, RTMP, RX, RKM); +	inpack_blocks(RL4, RR4, RTMP, RX, RKM);  	round(RL, RR, 0, 1);  	round(RR, RL, 1, 2); @@ -276,44 +271,41 @@ __cast5_enc_blk_16way:  	round(RR, RL, 15, 1);  __skip_enc: -	popq %rcx;  	popq %rbx;  	popq %rbp;  	vmovdqa .Lbswap_mask, RKM; -	leaq 1*(2*4*4)(%r11), %rax; - -	testb %cl, %cl; -	jnz __enc_xor16; - -	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); -	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); -	leaq 2*(2*4*4)(%r11), %rax; -	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); -	leaq 3*(2*4*4)(%r11), %rax; -	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); - -	ret; -__enc_xor16: -	outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM); -	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM); -	leaq 2*(2*4*4)(%r11), %rax; -	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM); -	leaq 3*(2*4*4)(%r11), %rax; -	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM); +	outunpack_blocks(RR1, RL1, RTMP, RX, RKM); +	outunpack_blocks(RR2, RL2, RTMP, RX, RKM); +	outunpack_blocks(RR3, RL3, RTMP, RX, RKM); +	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);  	ret;  .align 16 -.global cast5_dec_blk_16way -.type   cast5_dec_blk_16way,@function; +.type   __cast5_dec_blk16,@function; -cast5_dec_blk_16way: +__cast5_dec_blk16:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src +	 *	RL1: encrypted blocks 1 and 2 +	 *	RR1: encrypted blocks 3 and 4 +	 *	RL2: encrypted blocks 5 and 6 +	 *	RR2: encrypted blocks 7 and 8 +	 *	RL3: encrypted blocks 9 and 10 +	 *	RR3: encrypted blocks 11 and 12 +	 *	RL4: encrypted blocks 13 and 14 +	 *	RR4: encrypted blocks 15 and 16 +	 * output: +	 *	RL1: decrypted blocks 1 and 2 +	 *	RR1: decrypted blocks 3 and 4 +	 *	RL2: decrypted blocks 5 and 6 +	 *	RR2: decrypted blocks 7 and 8 +	 *	RL3: decrypted blocks 9 and 10 +	 *	RR3: decrypted blocks 11 and 12 +	 *	RL4: decrypted blocks 13 and 14 +	 *	RR4: decrypted blocks 15 and 16  	 */  	pushq %rbp; @@ -324,15 +316,10 @@ cast5_dec_blk_16way:  	vmovd .L32_mask, R32;  	dec_preload_rkr(); -	leaq 1*(2*4*4)(%rdx), %rax; -	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); -	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); -	leaq 2*(2*4*4)(%rdx), %rax; -	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); -	leaq 3*(2*4*4)(%rdx), %rax; -	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - -	movq %rsi, %r11; +	inpack_blocks(RL1, RR1, RTMP, RX, RKM); +	inpack_blocks(RL2, RR2, RTMP, RX, RKM); +	inpack_blocks(RL3, RR3, RTMP, RX, RKM); +	inpack_blocks(RL4, RR4, RTMP, RX, RKM);  	movzbl rr(CTX), %eax;  	testl %eax, %eax; @@ -361,16 +348,211 @@ __dec_tail:  	popq %rbx;  	popq %rbp; -	leaq 
1*(2*4*4)(%r11), %rax; -	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); -	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); -	leaq 2*(2*4*4)(%r11), %rax; -	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); -	leaq 3*(2*4*4)(%r11), %rax; -	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); +	outunpack_blocks(RR1, RL1, RTMP, RX, RKM); +	outunpack_blocks(RR2, RL2, RTMP, RX, RKM); +	outunpack_blocks(RR3, RL3, RTMP, RX, RKM); +	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);  	ret;  __skip_dec:  	vpsrldq $4, RKR, RKR;  	jmp __dec_tail; + +.align 16 +.global cast5_ecb_enc_16way +.type   cast5_ecb_enc_16way,@function; + +cast5_ecb_enc_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	movq %rsi, %r11; + +	vmovdqu (0*4*4)(%rdx), RL1; +	vmovdqu (1*4*4)(%rdx), RR1; +	vmovdqu (2*4*4)(%rdx), RL2; +	vmovdqu (3*4*4)(%rdx), RR2; +	vmovdqu (4*4*4)(%rdx), RL3; +	vmovdqu (5*4*4)(%rdx), RR3; +	vmovdqu (6*4*4)(%rdx), RL4; +	vmovdqu (7*4*4)(%rdx), RR4; + +	call __cast5_enc_blk16; + +	vmovdqu RR1, (0*4*4)(%r11); +	vmovdqu RL1, (1*4*4)(%r11); +	vmovdqu RR2, (2*4*4)(%r11); +	vmovdqu RL2, (3*4*4)(%r11); +	vmovdqu RR3, (4*4*4)(%r11); +	vmovdqu RL3, (5*4*4)(%r11); +	vmovdqu RR4, (6*4*4)(%r11); +	vmovdqu RL4, (7*4*4)(%r11); + +	ret; + +.align 16 +.global cast5_ecb_dec_16way +.type   cast5_ecb_dec_16way,@function; + +cast5_ecb_dec_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	movq %rsi, %r11; + +	vmovdqu (0*4*4)(%rdx), RL1; +	vmovdqu (1*4*4)(%rdx), RR1; +	vmovdqu (2*4*4)(%rdx), RL2; +	vmovdqu (3*4*4)(%rdx), RR2; +	vmovdqu (4*4*4)(%rdx), RL3; +	vmovdqu (5*4*4)(%rdx), RR3; +	vmovdqu (6*4*4)(%rdx), RL4; +	vmovdqu (7*4*4)(%rdx), RR4; + +	call __cast5_dec_blk16; + +	vmovdqu RR1, (0*4*4)(%r11); +	vmovdqu RL1, (1*4*4)(%r11); +	vmovdqu RR2, (2*4*4)(%r11); +	vmovdqu RL2, (3*4*4)(%r11); +	vmovdqu RR3, (4*4*4)(%r11); +	vmovdqu RL3, (5*4*4)(%r11); +	vmovdqu RR4, (6*4*4)(%r11); +	vmovdqu RL4, (7*4*4)(%r11); + +	ret; + +.align 16 +.global cast5_cbc_dec_16way +.type   cast5_cbc_dec_16way,@function; + +cast5_cbc_dec_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	pushq %r12; + +	movq %rsi, %r11; +	movq %rdx, %r12; + +	vmovdqu (0*16)(%rdx), RL1; +	vmovdqu (1*16)(%rdx), RR1; +	vmovdqu (2*16)(%rdx), RL2; +	vmovdqu (3*16)(%rdx), RR2; +	vmovdqu (4*16)(%rdx), RL3; +	vmovdqu (5*16)(%rdx), RR3; +	vmovdqu (6*16)(%rdx), RL4; +	vmovdqu (7*16)(%rdx), RR4; + +	call __cast5_dec_blk16; + +	/* xor with src */ +	vmovq (%r12), RX; +	vpshufd $0x4f, RX, RX; +	vpxor RX, RR1, RR1; +	vpxor 0*16+8(%r12), RL1, RL1; +	vpxor 1*16+8(%r12), RR2, RR2; +	vpxor 2*16+8(%r12), RL2, RL2; +	vpxor 3*16+8(%r12), RR3, RR3; +	vpxor 4*16+8(%r12), RL3, RL3; +	vpxor 5*16+8(%r12), RR4, RR4; +	vpxor 6*16+8(%r12), RL4, RL4; + +	vmovdqu RR1, (0*16)(%r11); +	vmovdqu RL1, (1*16)(%r11); +	vmovdqu RR2, (2*16)(%r11); +	vmovdqu RL2, (3*16)(%r11); +	vmovdqu RR3, (4*16)(%r11); +	vmovdqu RL3, (5*16)(%r11); +	vmovdqu RR4, (6*16)(%r11); +	vmovdqu RL4, (7*16)(%r11); + +	popq %r12; + +	ret; + +.align 16 +.global cast5_ctr_16way +.type   cast5_ctr_16way,@function; + +cast5_ctr_16way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 *	%rcx: iv (big endian, 64bit) +	 */ + +	pushq %r12; + +	movq %rsi, %r11; +	movq %rdx, %r12; + +	vpcmpeqd RTMP, RTMP, RTMP; +	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */ + +	vpcmpeqd RKR, RKR, RKR; +	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ +	vmovdqa .Lbswap_iv_mask, R1ST; +	vmovdqa .Lbswap128_mask, RKM; + +	/* load IV and byteswap */ +	vmovq (%rcx), RX; +	
vpshufb R1ST, RX, RX; + +	/* construct IVs */ +	vpsubq RTMP, RX, RX;  /* le: IV1, IV0 */ +	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */ +	vpsubq RKR, RX, RX; +	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */ +	vpsubq RKR, RX, RX; +	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */ +	vpsubq RKR, RX, RX; +	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */ +	vpsubq RKR, RX, RX; +	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */ +	vpsubq RKR, RX, RX; +	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */ +	vpsubq RKR, RX, RX; +	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */ +	vpsubq RKR, RX, RX; +	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */ + +	/* store last IV */ +	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */ +	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */ +	vmovq RX, (%rcx); + +	call __cast5_enc_blk16; + +	/* dst = src ^ iv */ +	vpxor (0*16)(%r12), RR1, RR1; +	vpxor (1*16)(%r12), RL1, RL1; +	vpxor (2*16)(%r12), RR2, RR2; +	vpxor (3*16)(%r12), RL2, RL2; +	vpxor (4*16)(%r12), RR3, RR3; +	vpxor (5*16)(%r12), RL3, RL3; +	vpxor (6*16)(%r12), RR4, RR4; +	vpxor (7*16)(%r12), RL4, RL4; +	vmovdqu RR1, (0*16)(%r11); +	vmovdqu RL1, (1*16)(%r11); +	vmovdqu RR2, (2*16)(%r11); +	vmovdqu RL2, (3*16)(%r11); +	vmovdqu RR3, (4*16)(%r11); +	vmovdqu RL3, (5*16)(%r11); +	vmovdqu RR4, (6*16)(%r11); +	vmovdqu RL4, (7*16)(%r11); + +	popq %r12; + +	ret; diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e0ea14f9547..c6631813dc1 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -37,29 +37,14 @@  #define CAST5_PARALLEL_BLOCKS 16 -asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, -				      const u8 *src, bool xor); -asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst, +asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,  				    const u8 *src); - -static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst, -				      const u8 *src) -{ -	__cast5_enc_blk_16way(ctx, dst, src, false); -} - -static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst, -					  const u8 *src) -{ -	__cast5_enc_blk_16way(ctx, dst, src, true); -} - -static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst, -				      const u8 *src) -{ -	cast5_dec_blk_16way(ctx, dst, src); -} - +asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, +				    const u8 *src); +asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, +				    const u8 *src); +asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src, +				__be64 *iv);  static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)  { @@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,  	struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);  	const unsigned int bsize = CAST5_BLOCK_SIZE;  	unsigned int nbytes; +	void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);  	int err; +	fn = (enc) ? 
cast5_ecb_enc_16way : cast5_ecb_dec_16way; +  	err = blkcipher_walk_virt(desc, walk);  	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; @@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,  		/* Process multi-block batch */  		if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {  			do { -				if (enc) -					cast5_enc_blk_xway(ctx, wdst, wsrc); -				else -					cast5_dec_blk_xway(ctx, wdst, wsrc); +				fn(ctx, wdst, wsrc);  				wsrc += bsize * CAST5_PARALLEL_BLOCKS;  				wdst += bsize * CAST5_PARALLEL_BLOCKS; @@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,  				goto done;  		} +		fn = (enc) ? __cast5_encrypt : __cast5_decrypt; +  		/* Handle leftovers */  		do { -			if (enc) -				__cast5_encrypt(ctx, wdst, wsrc); -			else -				__cast5_decrypt(ctx, wdst, wsrc); +			fn(ctx, wdst, wsrc);  			wsrc += bsize;  			wdst += bsize; @@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,  	unsigned int nbytes = walk->nbytes;  	u64 *src = (u64 *)walk->src.virt.addr;  	u64 *dst = (u64 *)walk->dst.virt.addr; -	u64 ivs[CAST5_PARALLEL_BLOCKS - 1];  	u64 last_iv; -	int i;  	/* Start of the last block. */  	src += nbytes / bsize - 1; @@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,  			src -= CAST5_PARALLEL_BLOCKS - 1;  			dst -= CAST5_PARALLEL_BLOCKS - 1; -			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) -				ivs[i] = src[i]; - -			cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - -			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) -				*(dst + (i + 1)) ^= *(ivs + i); +			cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);  			nbytes -= bsize;  			if (nbytes < bsize) @@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,  	unsigned int nbytes = walk->nbytes;  	u64 *src = (u64 *)walk->src.virt.addr;  	u64 *dst = (u64 *)walk->dst.virt.addr; -	u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); -	__be64 ctrblocks[CAST5_PARALLEL_BLOCKS]; -	int i;  	/* Process multi-block batch */  	if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {  		do { -			/* create ctrblks for parallel encrypt */ -			for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) { -				if (dst != src) -					dst[i] = src[i]; - -				ctrblocks[i] = cpu_to_be64(ctrblk++); -			} - -			cast5_enc_blk_xway_xor(ctx, (u8 *)dst, -					       (u8 *)ctrblocks); +			cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src, +					(__be64 *)walk->iv);  			src += CAST5_PARALLEL_BLOCKS;  			dst += CAST5_PARALLEL_BLOCKS; @@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,  	/* Handle leftovers */  	do { +		u64 ctrblk; +  		if (dst != src)  			*dst = *src; -		ctrblocks[0] = cpu_to_be64(ctrblk++); +		ctrblk = *(u64 *)walk->iv; +		be64_add_cpu((__be64 *)walk->iv, 1); -		__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); -		*dst ^= ctrblocks[0]; +		__cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); +		*dst ^= ctrblk;  		src += 1;  		dst += 1; @@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,  	} while (nbytes >= bsize);  done: -	*(__be64 *)walk->iv = cpu_to_be64(ctrblk);  	return nbytes;  } diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 218d283772f..2569d0da841 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -23,22 +23,24 @@   *   */ +#include "glue_helper-asm-avx.S" +  .file "cast6-avx-x86_64-asm_64.S" -.extern cast6_s1 -.extern cast6_s2 -.extern cast6_s3 
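
The reworked ecb_crypt() above reduces the enc/dec branching to a function pointer: it first consumes the data in 16-block chunks with the _16way routine, then switches the pointer to the single-block routine for the tail. The same widest-first walk is what the common_glue_ctx tables elsewhere in this patch encode as data. A standalone sketch of the walk follows; width_entry and ecb_walk are illustration-only names, and the block size is passed in rather than fixed.

#include <stdint.h>
#include <stddef.h>

/* One entry: a routine that handles 'num_blocks' cipher blocks per call. */
struct width_entry {
	unsigned int num_blocks;
	void (*fn)(void *ctx, uint8_t *dst, const uint8_t *src);
};

/*
 * Consume the buffer with the widest routine that still fits, then fall
 * back to narrower entries for the tail. Entries are ordered widest first
 * and end with a 1-block entry.
 */
static size_t ecb_walk(const struct width_entry *tbl, size_t nentries,
		       size_t bsize, void *ctx,
		       uint8_t *dst, const uint8_t *src, size_t nbytes)
{
	for (size_t i = 0; i < nentries; i++) {
		size_t chunk = tbl[i].num_blocks * bsize;

		while (nbytes >= chunk) {
			tbl[i].fn(ctx, dst, src);
			src += chunk;
			dst += chunk;
			nbytes -= chunk;
		}
	}
	return nbytes;	/* anything left is smaller than one block */
}

A two-entry table, the 16-way routine followed by the single-block fallback, reproduces the loop in ecb_crypt() above; the common_glue_ctx tables in the glue files are the data-driven form of the same idea.
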
-.extern cast6_s4 +.extern cast_s1 +.extern cast_s2 +.extern cast_s3 +.extern cast_s4  /* structure of crypto context */  #define km	0  #define kr	(12*4*4)  /* s-boxes */ -#define s1	cast6_s1 -#define s2	cast6_s2 -#define s3	cast6_s3 -#define s4	cast6_s4 +#define s1	cast_s1 +#define s2	cast_s2 +#define s3	cast_s3 +#define s4	cast_s4  /**********************************************************************    8-way AVX cast6 @@ -205,11 +207,7 @@  	vpunpcklqdq		x3, t2, x2; \  	vpunpckhqdq		x3, t2, x3; -#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \ -	vmovdqu (0*4*4)(in),	x0; \ -	vmovdqu (1*4*4)(in),	x1; \ -	vmovdqu (2*4*4)(in),	x2; \ -	vmovdqu (3*4*4)(in),	x3; \ +#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \  	vpshufb rmask, x0,	x0; \  	vpshufb rmask, x1,	x1; \  	vpshufb rmask, x2,	x2; \ @@ -217,39 +215,21 @@  	\  	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ +#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \  	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \  	\  	vpshufb rmask,		x0, x0;       \  	vpshufb rmask,		x1, x1;       \  	vpshufb rmask,		x2, x2;       \ -	vpshufb rmask,		x3, x3;       \ -	vmovdqu x0,		(0*4*4)(out); \ -	vmovdqu	x1,		(1*4*4)(out); \ -	vmovdqu	x2,		(2*4*4)(out); \ -	vmovdqu	x3,		(3*4*4)(out); - -#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ -	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ -	\ -	vpshufb rmask,		x0, x0;       \ -	vpshufb rmask,		x1, x1;       \ -	vpshufb rmask,		x2, x2;       \ -	vpshufb rmask,		x3, x3;       \ -	vpxor (0*4*4)(out),	x0, x0;       \ -	vmovdqu	x0,		(0*4*4)(out); \ -	vpxor (1*4*4)(out),	x1, x1;       \ -	vmovdqu	x1,		(1*4*4)(out); \ -	vpxor (2*4*4)(out),	x2, x2;       \ -	vmovdqu x2,		(2*4*4)(out); \ -	vpxor (3*4*4)(out),	x3, x3;       \ -	vmovdqu x3,		(3*4*4)(out); +	vpshufb rmask,		x3, x3;  .data  .align 16  .Lbswap_mask:  	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.Lbswap128_mask: +	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0  .Lrkr_enc_Q_Q_QBAR_QBAR:  	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12  .Lrkr_enc_QBAR_QBAR_QBAR_QBAR: @@ -269,31 +249,26 @@  .text -.align 16 -.global __cast6_enc_blk_8way -.type   __cast6_enc_blk_8way,@function; +.align 8 +.type   __cast6_enc_blk8,@function; -__cast6_enc_blk_8way: +__cast6_enc_blk8:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src -	 *	%rcx: bool, if true: xor output +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks +	 * output: +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks  	 */  	pushq %rbp;  	pushq %rbx; -	pushq %rcx;  	vmovdqa .Lbswap_mask, RKM;  	vmovd .Lfirst_mask, R1ST;  	vmovd .L32_mask, R32; -	leaq (4*4*4)(%rdx), %rax; -	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); -	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - -	movq %rsi, %r11; +	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); +	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);  	preload_rkr(0, dummy, none);  	Q(0); @@ -311,36 +286,25 @@ __cast6_enc_blk_8way:  	QBAR(10);  	QBAR(11); -	popq %rcx;  	popq %rbx;  	popq %rbp;  	vmovdqa .Lbswap_mask, RKM; -	leaq (4*4*4)(%r11), %rax; - -	testb %cl, %cl; -	jnz __enc_xor8; - -	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); -	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - -	ret; -__enc_xor8: -	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); -	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, 
RTMP, RX, RKRF, RKM); +	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); +	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);  	ret; -.align 16 -.global cast6_dec_blk_8way -.type   cast6_dec_blk_8way,@function; +.align 8 +.type   __cast6_dec_blk8,@function; -cast6_dec_blk_8way: +__cast6_dec_blk8:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks +	 * output: +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks  	 */  	pushq %rbp; @@ -350,11 +314,8 @@ cast6_dec_blk_8way:  	vmovd .Lfirst_mask, R1ST;  	vmovd .L32_mask, R32; -	leaq (4*4*4)(%rdx), %rax; -	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); -	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - -	movq %rsi, %r11; +	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); +	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);  	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);  	Q(11); @@ -376,8 +337,103 @@ cast6_dec_blk_8way:  	popq %rbp;  	vmovdqa .Lbswap_mask, RKM; -	leaq (4*4*4)(%r11), %rax; -	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); -	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); +	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); +	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); + +	ret; + +.align 8 +.global cast6_ecb_enc_8way +.type   cast6_ecb_enc_8way,@function; + +cast6_ecb_enc_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	movq %rsi, %r11; + +	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	call __cast6_enc_blk8; + +	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	ret; + +.align 8 +.global cast6_ecb_dec_8way +.type   cast6_ecb_dec_8way,@function; + +cast6_ecb_dec_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	movq %rsi, %r11; + +	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	call __cast6_dec_blk8; + +	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	ret; + +.align 8 +.global cast6_cbc_dec_8way +.type   cast6_cbc_dec_8way,@function; + +cast6_cbc_dec_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	pushq %r12; + +	movq %rsi, %r11; +	movq %rdx, %r12; + +	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	call __cast6_dec_blk8; + +	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	popq %r12; + +	ret; + +.align 8 +.global cast6_ctr_8way +.type   cast6_ctr_8way,@function; + +cast6_ctr_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 *	%rcx: iv (little endian, 128bit) +	 */ + +	pushq %r12; + +	movq %rsi, %r11; +	movq %rdx, %r12; + +	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, +		      RD2, RX, RKR, RKM); + +	call __cast6_enc_blk8; + +	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	popq %r12;  	ret; diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 15e5f85a501..92f7ca24790 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -40,79 +40,34 @@  #define CAST6_PARALLEL_BLOCKS 8 -asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, -				     const u8 *src, bool xor); -asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, +asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst, +				   const u8 *src); +asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,  				   const u8 *src); 
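
cast6_ctr_8way above takes its IV as a 128-bit little-endian counter, and the glue code in this patch correspondingly switches the CTR helpers from u128 to le128 (le128_inc()/le128_to_be128()): the counter is kept little-endian and only byte-swapped when each counter block is formed. A standalone sketch of one CTR step under that convention follows; ctr128, ctr_crypt_block and the encrypt_block callback are illustration-only names, not the kernel's le128 helpers.

#include <stdint.h>

/* 128-bit CTR counter kept as little-endian 64-bit halves. */
struct ctr128 { uint64_t lo, hi; };

static void ctr128_inc(struct ctr128 *c)
{
	if (++c->lo == 0)	/* propagate the carry into the high half */
		c->hi++;
}

/*
 * One CTR step: serialize the counter as a big-endian 16-byte block,
 * encrypt it, XOR the keystream into the data, bump the counter.
 * 'encrypt_block' stands in for the cipher's one-block encrypt routine.
 */
static void ctr_crypt_block(void *ctx,
			    void (*encrypt_block)(void *ctx, uint8_t *dst,
						  const uint8_t *src),
			    uint8_t dst[16], const uint8_t src[16],
			    struct ctr128 *iv)
{
	uint8_t ctrblk[16];
	int i;

	/* big-endian layout: most significant byte of 'hi' first */
	for (i = 0; i < 8; i++) {
		ctrblk[i]     = (uint8_t)(iv->hi >> (56 - 8 * i));
		ctrblk[i + 8] = (uint8_t)(iv->lo >> (56 - 8 * i));
	}
	ctr128_inc(iv);

	encrypt_block(ctx, ctrblk, ctrblk);
	for (i = 0; i < 16; i++)
		dst[i] = src[i] ^ ctrblk[i];
}

cast5 applies the same idea with a 64-bit big-endian counter (hence the be64_add_cpu() in its leftover path above), while cast6, serpent and twofish keep the full 128-bit little-endian counter shown here.
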
-static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, -				      const u8 *src) -{ -	__cast6_enc_blk_8way(ctx, dst, src, false); -} - -static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst, -					  const u8 *src) -{ -	__cast6_enc_blk_8way(ctx, dst, src, true); -} - -static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst, -				      const u8 *src) -{ -	cast6_dec_blk_8way(ctx, dst, src); -} - - -static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) -{ -	u128 ivs[CAST6_PARALLEL_BLOCKS - 1]; -	unsigned int j; - -	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) -		ivs[j] = src[j]; - -	cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - -	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) -		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); -} +asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, +				   const u8 *src); +asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, +			       le128 *iv); -static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)  {  	be128 ctrblk; -	u128_to_be128(&ctrblk, iv); -	u128_inc(iv); +	le128_to_be128(&ctrblk, iv); +	le128_inc(iv);  	__cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);  	u128_xor(dst, src, (u128 *)&ctrblk);  } -static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, -				   u128 *iv) -{ -	be128 ctrblks[CAST6_PARALLEL_BLOCKS]; -	unsigned int i; - -	for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) { -		if (dst != src) -			dst[i] = src[i]; - -		u128_to_be128(&ctrblks[i], iv); -		u128_inc(iv); -	} - -	cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); -} -  static const struct common_glue_ctx cast6_enc = {  	.num_funcs = 2,  	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,  	.funcs = { {  		.num_blocks = CAST6_PARALLEL_BLOCKS, -		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } +		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }  	}, {  		.num_blocks = 1,  		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } @@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {  	.funcs = { {  		.num_blocks = CAST6_PARALLEL_BLOCKS, -		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }  	}, {  		.num_blocks = 1,  		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } @@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {  	.funcs = { {  		.num_blocks = CAST6_PARALLEL_BLOCKS, -		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } +		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }  	}, {  		.num_blocks = 1,  		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } @@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {  	.funcs = { {  		.num_blocks = CAST6_PARALLEL_BLOCKS, -		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }  	}, {  		.num_blocks = 1,  		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } @@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);  	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { -		cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); +		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);  		return;  	} @@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  	ctx->fpu_enabled = 
cast6_fpu_begin(ctx->fpu_enabled, nbytes);  	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { -		cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); +		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);  		return;  	} diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel_glue.c index 493f959261f..6812ad98355 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -32,6 +32,8 @@  #include <asm/cpufeature.h>  #include <asm/cpu_device_id.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h>  #define CHKSUM_BLOCK_SIZE	1  #define CHKSUM_DIGEST_SIZE	4 @@ -44,6 +46,31 @@  #define REX_PRE  #endif +#ifdef CONFIG_X86_64 +/* + * use carryless multiply version of crc32c when buffer + * size is >= 512 (when eager fpu is enabled) or + * >= 1024 (when eager fpu is disabled) to account + * for fpu state save/restore overhead. + */ +#define CRC32C_PCL_BREAKEVEN_EAGERFPU	512 +#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU	1024 + +asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, +				unsigned int crc_init); +static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; +#if defined(X86_FEATURE_EAGER_FPU) +#define set_pcl_breakeven_point()					\ +do {									\ +	if (!use_eager_fpu())						\ +		crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU;	\ +} while (0) +#else +#define set_pcl_breakeven_point()					\ +	(crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) +#endif +#endif /* CONFIG_X86_64 */ +  static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)  {  	while (length--) { @@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)  	return 0;  } +#ifdef CONFIG_X86_64 +static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, +			       unsigned int len) +{ +	u32 *crcp = shash_desc_ctx(desc); + +	/* +	 * use faster PCL version if datasize is large enough to +	 * overcome kernel fpu state save/restore overhead +	 */ +	if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { +		kernel_fpu_begin(); +		*crcp = crc_pcl(data, len, *crcp); +		kernel_fpu_end(); +	} else +		*crcp = crc32c_intel_le_hw(*crcp, data, len); +	return 0; +} + +static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, +				u8 *out) +{ +	if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { +		kernel_fpu_begin(); +		*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); +		kernel_fpu_end(); +	} else +		*(__le32 *)out = +			~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); +	return 0; +} + +static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data, +			      unsigned int len, u8 *out) +{ +	return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data, +			       unsigned int len, u8 *out) +{ +	return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, +				    out); +} +#endif /* CONFIG_X86_64 */ +  static struct shash_alg alg = {  	.setkey			=	crc32c_intel_setkey,  	.init			=	crc32c_intel_init, @@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)  {  	if (!x86_match_cpu(crc32c_cpu_id))  		return -ENODEV; +#ifdef CONFIG_X86_64 +	if (cpu_has_pclmulqdq) { +		alg.update = crc32c_pcl_intel_update; +		alg.finup = crc32c_pcl_intel_finup; +		alg.digest = crc32c_pcl_intel_digest; +		set_pcl_breakeven_point(); +	} +#endif  	return crypto_register_shash(&alg);  } diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S new file 
mode 100644 index 00000000000..93c6d39237a --- /dev/null +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -0,0 +1,460 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white paper on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://download.intel.com/design/intarch/papers/323405.pdf + * + * Copyright (C) 2012 Intel Corporation. + * + * Authors: + *	Wajdi Feghali <wajdi.k.feghali@intel.com> + *	James Guilford <james.guilford@intel.com> + *	David Cote <david.m.cote@intel.com> + *	Tim Chen <tim.c.chen@linux.intel.com> + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. + * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JMPTBL_ENTRY i +.word crc_\i - crc_array +.endm + +.macro JNC_LESS_THAN j +	jnc less_than_\j +.endm + +# Define threshold where buffers are considered "small" and routed to more +# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +# SMALL_SIZE can be no larger than 255. 
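
Two size thresholds govern this code: the glue's crc32c_pcl_intel_update() only enters the PCLMULQDQ path once the buffer is past the FPU save/restore breakeven (512 bytes with eager FPU, 1024 without), and inside crc_pcl buffers below SMALL_SIZE are routed to the simple "by-1" tail code. A standalone sketch of the glue-level dispatch follows; crc32c_sw is a plain bit-by-bit CRC32C reference, and fast_crc, fpu_begin/fpu_end and BREAKEVEN stand in for crc_pcl(), kernel_fpu_begin()/kernel_fpu_end() and the breakeven constants chosen in the patch.

#include <stdint.h>
#include <stddef.h>

/* Portable bit-by-bit CRC32C (Castagnoli, reflected polynomial). */
static uint32_t crc32c_sw(uint32_t crc, const uint8_t *data, size_t len)
{
	while (len--) {
		crc ^= *data++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return crc;
}

#define BREAKEVEN 512	/* stand-in for the 512/1024-byte breakeven */

/*
 * Only take the SIMD path when the buffer is long enough to amortize the
 * FPU state save/restore; otherwise use the scalar path (the real code
 * uses the crc32 instruction there, crc32c_sw is just a portable stand-in).
 */
static uint32_t crc32c_update(uint32_t crc, const uint8_t *data, size_t len,
			      int fpu_usable,
			      uint32_t (*fast_crc)(const uint8_t *, size_t,
						   uint32_t),
			      void (*fpu_begin)(void), void (*fpu_end)(void))
{
	if (len >= BREAKEVEN && fpu_usable) {
		fpu_begin();
		crc = fast_crc(data, len, crc);
		fpu_end();
	} else {
		crc = crc32c_sw(crc, data, len);
	}
	return crc;
}
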
+ +#define SMALL_SIZE 200 + +.if (SMALL_SIZE > 255) +.error "SMALL_ SIZE must be < 256" +.endif + +# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + +.global crc_pcl +crc_pcl: +#define    bufp		%rdi +#define    bufp_dw	%edi +#define    bufp_w	%di +#define    bufp_b	%dil +#define    bufptmp	%rcx +#define    block_0	%rcx +#define    block_1	%rdx +#define    block_2	%r11 +#define    len		%rsi +#define    len_dw	%esi +#define    len_w	%si +#define    len_b	%sil +#define    crc_init_arg %rdx +#define    tmp		%rbx +#define    crc_init	%r8 +#define    crc_init_dw	%r8d +#define    crc1		%r9 +#define    crc2		%r10 + +	pushq   %rbx +	pushq   %rdi +	pushq   %rsi + +	## Move crc_init for Linux to a different +	mov     crc_init_arg, crc_init + +	################################################################ +	## 1) ALIGN: +	################################################################ + +	mov     bufp, bufptmp		# rdi = *buf +	neg     bufp +	and     $7, bufp		# calculate the unalignment amount of +					# the address +	je      proc_block		# Skip if aligned + +	## If len is less than 8 and we're unaligned, we need to jump +	## to special code to avoid reading beyond the end of the buffer +	cmp     $8, len +	jae     do_align +	# less_than_8 expects length in upper 3 bits of len_dw +	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +	shl     $32-3+1, len_dw +	jmp     less_than_8_post_shl1 + +do_align: +	#### Calculate CRC of unaligned bytes of the buffer (if any) +	movq    (bufptmp), tmp		# load a quadward from the buffer +	add     bufp, bufptmp		# align buffer pointer for quadword +					# processing +	sub     bufp, len		# update buffer length +align_loop: +	crc32b  %bl, crc_init_dw 	# compute crc32 of 1-byte +	shr     $8, tmp			# get next byte +	dec     bufp +	jne     align_loop + +proc_block: + +	################################################################ +	## 2) PROCESS  BLOCKS: +	################################################################ + +	## compute num of bytes to be processed +	movq    len, tmp		# save num bytes in tmp + +	cmpq    $128*24, len +	jae     full_block + +continue_block: +	cmpq    $SMALL_SIZE, len +	jb      small + +	## len < 128*24 +	movq    $2731, %rax		# 2731 = ceil(2^16 / 24) +	mul     len_dw +	shrq    $16, %rax + +	## eax contains floor(bytes / 24) = num 24-byte chunks to do + +	## process rax 24-byte chunks (128 >= rax >= 0) + +	## compute end address of each block +	## block 0 (base addr + RAX * 8) +	## block 1 (base addr + RAX * 16) +	## block 2 (base addr + RAX * 24) +	lea     (bufptmp, %rax, 8), block_0 +	lea     (block_0, %rax, 8), block_1 +	lea     (block_1, %rax, 8), block_2 + +	xor     crc1, crc1 +	xor     crc2, crc2 + +	## branch into array +	lea	jump_table(%rip), bufp +	movzxw  (bufp, %rax, 2), len +	offset=crc_array-jump_table +	lea     offset(bufp, len, 1), bufp +	jmp     *bufp + +	################################################################ +	## 2a) PROCESS FULL BLOCKS: +	################################################################ +full_block: +	movq    $128,%rax +	lea     128*8*2(block_0), block_1 +	lea     128*8*3(block_0), block_2 +	add     $128*8*1, block_0 + +	xor     crc1,crc1 +	xor     crc2,crc2 + +	# Fall thruogh into top of crc array (crc_128) + +	################################################################ +	## 3) CRC Array: +	################################################################ + +crc_array: +	i=128 +.rept 128-1 +.altmacro +LABEL crc_ %i +.noaltmacro +	crc32q   -i*8(block_0), 
crc_init +	crc32q   -i*8(block_1), crc1 +	crc32q   -i*8(block_2), crc2 +	i=(i-1) +.endr + +.altmacro +LABEL crc_ %i +.noaltmacro +	crc32q   -i*8(block_0), crc_init +	crc32q   -i*8(block_1), crc1 +# SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet + +	mov     block_2, block_0 + +	################################################################ +	## 4) Combine three results: +	################################################################ + +	lea	(K_table-16)(%rip), bufp	# first entry is for idx 1 +	shlq    $3, %rax			# rax *= 8 +	subq    %rax, tmp			# tmp -= rax*8 +	shlq    $1, %rax +	subq    %rax, tmp			# tmp -= rax*16 +						# (total tmp -= rax*24) +	addq    %rax, bufp + +	movdqa  (bufp), %xmm0			# 2 consts: K1:K2 + +	movq    crc_init, %xmm1			# CRC for block 1 +	pclmulqdq $0x00,%xmm0,%xmm1		# Multiply by K2 + +	movq    crc1, %xmm2			# CRC for block 2 +	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1 + +	pxor    %xmm2,%xmm1 +	movq    %xmm1, %rax +	xor     -i*8(block_2), %rax +	mov     crc2, crc_init +	crc32   %rax, crc_init + +################################################################ +## 5) Check for end: +################################################################ + +LABEL crc_ 0 +	mov     tmp, len +	cmp     $128*24, tmp +	jae     full_block +	cmp     $24, tmp +	jae     continue_block + +less_than_24: +	shl     $32-4, len_dw			# less_than_16 expects length +						# in upper 4 bits of len_dw +	jnc     less_than_16 +	crc32q  (bufptmp), crc_init +	crc32q  8(bufptmp), crc_init +	jz      do_return +	add     $16, bufptmp +	# len is less than 8 if we got here +	# less_than_8 expects length in upper 3 bits of len_dw +	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +	shl     $2, len_dw +	jmp     less_than_8_post_shl1 + +	####################################################################### +	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) +	####################################################################### +small: +	shl $32-8, len_dw		# Prepare len_dw for less_than_256 +	j=256 +.rept 5					# j = {256, 128, 64, 32, 16} +.altmacro +LABEL less_than_ %j			# less_than_j: Length should be in +					# upper lg(j) bits of len_dw +	j=(j/2) +	shl     $1, len_dw		# Get next MSB +	JNC_LESS_THAN %j +.noaltmacro +	i=0 +.rept (j/8) +	crc32q  i(bufptmp), crc_init	# Compute crc32 of 8-byte data +	i=i+8 +.endr +	jz      do_return		# Return if remaining length is zero +	add     $j, bufptmp		# Advance buf +.endr + +less_than_8:				# Length should be stored in +					# upper 3 bits of len_dw +	shl     $1, len_dw +less_than_8_post_shl1: +	jnc     less_than_4 +	crc32l  (bufptmp), crc_init_dw	# CRC of 4 bytes +	jz      do_return		# return if remaining data is zero +	add     $4, bufptmp +less_than_4:				# Length should be stored in +					# upper 2 bits of len_dw +	shl     $1, len_dw +	jnc     less_than_2 +	crc32w  (bufptmp), crc_init_dw	# CRC of 2 bytes +	jz      do_return		# return if remaining data is zero +	add     $2, bufptmp +less_than_2:				# Length should be stored in the MSB +					# of len_dw +	shl     $1, len_dw +	jnc     less_than_1 +	crc32b  (bufptmp), crc_init_dw	# CRC of 1 byte +less_than_1:				# Length should be zero +do_return: +	movq    crc_init, %rax +	popq    %rsi +	popq    %rdi +	popq    %rbx +        ret + +        ################################################################ +        ## jump table        Table is 129 entries x 2 bytes each +        ################################################################ +.align 4 
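
The block-processing core above splits the buffer into three equal streams and runs three independent crc32q chains (crc_init, crc1, crc2), then folds the CRCs of the first two streams into the third with two PCLMULQDQ multiplies by per-length constants from K_table and one final crc32q. A small supporting trick is the divide-free chunk count: for the sub-3072-byte continue_block path, floor(len / 24) is computed as len * 2731 >> 16, with 2731 = ceil(2^16 / 24). The sketch below is only a quick standalone check of that identity over the whole range the path can see.

#include <assert.h>

/*
 * Verify floor(len / 24) == (len * 2731) >> 16 for every length handled
 * by continue_block (len < 128 * 24); the approximation error of 2731
 * versus 2^16/24 is far too small to matter at these lengths.
 */
int main(void)
{
	unsigned int len;

	for (len = 0; len < 128 * 24; len++)
		assert(((len * 2731u) >> 16) == len / 24);

	return 0;
}
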
+jump_table: +	i=0 +.rept 129 +.altmacro +JMPTBL_ENTRY %i +.noaltmacro +	i=i+1 +.endr +	################################################################ +	## PCLMULQDQ tables +	## Table is 128 entries x 2 quad words each +	################################################################ +.data +.align 64 +K_table: +        .quad 0x14cd00bd6,0x105ec76f0 +        .quad 0x0ba4fc28e,0x14cd00bd6 +        .quad 0x1d82c63da,0x0f20c0dfe +        .quad 0x09e4addf8,0x0ba4fc28e +        .quad 0x039d3b296,0x1384aa63a +        .quad 0x102f9b8a2,0x1d82c63da +        .quad 0x14237f5e6,0x01c291d04 +        .quad 0x00d3b6092,0x09e4addf8 +        .quad 0x0c96cfdc0,0x0740eef02 +        .quad 0x18266e456,0x039d3b296 +        .quad 0x0daece73e,0x0083a6eec +        .quad 0x0ab7aff2a,0x102f9b8a2 +        .quad 0x1248ea574,0x1c1733996 +        .quad 0x083348832,0x14237f5e6 +        .quad 0x12c743124,0x02ad91c30 +        .quad 0x0b9e02b86,0x00d3b6092 +        .quad 0x018b33a4e,0x06992cea2 +        .quad 0x1b331e26a,0x0c96cfdc0 +        .quad 0x17d35ba46,0x07e908048 +        .quad 0x1bf2e8b8a,0x18266e456 +        .quad 0x1a3e0968a,0x11ed1f9d8 +        .quad 0x0ce7f39f4,0x0daece73e +        .quad 0x061d82e56,0x0f1d0f55e +        .quad 0x0d270f1a2,0x0ab7aff2a +        .quad 0x1c3f5f66c,0x0a87ab8a8 +        .quad 0x12ed0daac,0x1248ea574 +        .quad 0x065863b64,0x08462d800 +        .quad 0x11eef4f8e,0x083348832 +        .quad 0x1ee54f54c,0x071d111a8 +        .quad 0x0b3e32c28,0x12c743124 +        .quad 0x0064f7f26,0x0ffd852c6 +        .quad 0x0dd7e3b0c,0x0b9e02b86 +        .quad 0x0f285651c,0x0dcb17aa4 +        .quad 0x010746f3c,0x018b33a4e +        .quad 0x1c24afea4,0x0f37c5aee +        .quad 0x0271d9844,0x1b331e26a +        .quad 0x08e766a0c,0x06051d5a2 +        .quad 0x093a5f730,0x17d35ba46 +        .quad 0x06cb08e5c,0x11d5ca20e +        .quad 0x06b749fb2,0x1bf2e8b8a +        .quad 0x1167f94f2,0x021f3d99c +        .quad 0x0cec3662e,0x1a3e0968a +        .quad 0x19329634a,0x08f158014 +        .quad 0x0e6fc4e6a,0x0ce7f39f4 +        .quad 0x08227bb8a,0x1a5e82106 +        .quad 0x0b0cd4768,0x061d82e56 +        .quad 0x13c2b89c4,0x188815ab2 +        .quad 0x0d7a4825c,0x0d270f1a2 +        .quad 0x10f5ff2ba,0x105405f3e +        .quad 0x00167d312,0x1c3f5f66c +        .quad 0x0f6076544,0x0e9adf796 +        .quad 0x026f6a60a,0x12ed0daac +        .quad 0x1a2adb74e,0x096638b34 +        .quad 0x19d34af3a,0x065863b64 +        .quad 0x049c3cc9c,0x1e50585a0 +        .quad 0x068bce87a,0x11eef4f8e +        .quad 0x1524fa6c6,0x19f1c69dc +        .quad 0x16cba8aca,0x1ee54f54c +        .quad 0x042d98888,0x12913343e +        .quad 0x1329d9f7e,0x0b3e32c28 +        .quad 0x1b1c69528,0x088f25a3a +        .quad 0x02178513a,0x0064f7f26 +        .quad 0x0e0ac139e,0x04e36f0b0 +        .quad 0x0170076fa,0x0dd7e3b0c +        .quad 0x141a1a2e2,0x0bd6f81f8 +        .quad 0x16ad828b4,0x0f285651c +        .quad 0x041d17b64,0x19425cbba +        .quad 0x1fae1cc66,0x010746f3c +        .quad 0x1a75b4b00,0x18db37e8a +        .quad 0x0f872e54c,0x1c24afea4 +        .quad 0x01e41e9fc,0x04c144932 +        .quad 0x086d8e4d2,0x0271d9844 +        .quad 0x160f7af7a,0x052148f02 +        .quad 0x05bb8f1bc,0x08e766a0c +        .quad 0x0a90fd27a,0x0a3c6f37a +        .quad 0x0b3af077a,0x093a5f730 +        .quad 0x04984d782,0x1d22c238e +        .quad 0x0ca6ef3ac,0x06cb08e5c +        .quad 0x0234e0b26,0x063ded06a +        .quad 0x1d88abd4a,0x06b749fb2 +        .quad 0x04597456a,0x04d56973c +        .quad 0x0e9e28eb4,0x1167f94f2 +        .quad 0x07b3ff57a,0x19385bf2e + 
       .quad 0x0c9c8b782,0x0cec3662e +        .quad 0x13a9cba9e,0x0e417f38a +        .quad 0x093e106a4,0x19329634a +        .quad 0x167001a9c,0x14e727980 +        .quad 0x1ddffc5d4,0x0e6fc4e6a +        .quad 0x00df04680,0x0d104b8fc +        .quad 0x02342001e,0x08227bb8a +        .quad 0x00a2a8d7e,0x05b397730 +        .quad 0x168763fa6,0x0b0cd4768 +        .quad 0x1ed5a407a,0x0e78eb416 +        .quad 0x0d2c3ed1a,0x13c2b89c4 +        .quad 0x0995a5724,0x1641378f0 +        .quad 0x19b1afbc4,0x0d7a4825c +        .quad 0x109ffedc0,0x08d96551c +        .quad 0x0f2271e60,0x10f5ff2ba +        .quad 0x00b0bf8ca,0x00bf80dd2 +        .quad 0x123888b7a,0x00167d312 +        .quad 0x1e888f7dc,0x18dcddd1c +        .quad 0x002ee03b2,0x0f6076544 +        .quad 0x183e8d8fe,0x06a45d2b2 +        .quad 0x133d7a042,0x026f6a60a +        .quad 0x116b0f50c,0x1dd3e10e8 +        .quad 0x05fabe670,0x1a2adb74e +        .quad 0x130004488,0x0de87806c +        .quad 0x000bcf5f6,0x19d34af3a +        .quad 0x18f0c7078,0x014338754 +        .quad 0x017f27698,0x049c3cc9c +        .quad 0x058ca5f00,0x15e3e77ee +        .quad 0x1af900c24,0x068bce87a +        .quad 0x0b5cfca28,0x0dd07448e +        .quad 0x0ded288f8,0x1524fa6c6 +        .quad 0x059f229bc,0x1d8048348 +        .quad 0x06d390dec,0x16cba8aca +        .quad 0x037170390,0x0a3e3e02c +        .quad 0x06353c1cc,0x042d98888 +        .quad 0x0c4584f5c,0x0d73c7bea +        .quad 0x1f16a3418,0x1329d9f7e +        .quad 0x0531377e2,0x185137662 +        .quad 0x1d8d9ca7c,0x1b1c69528 +        .quad 0x0b25b29f2,0x18a08b5bc +        .quad 0x19fb2a8b0,0x02178513a +        .quad 0x1a08fe6ac,0x1da758ae0 +        .quad 0x045cddf4e,0x0e0ac139e +        .quad 0x1a91647f2,0x169cf9eb0 +        .quad 0x1a0f717c4,0x0170076fa diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S new file mode 100644 index 00000000000..f7b6ea2ddfd --- /dev/null +++ b/arch/x86/crypto/glue_helper-asm-avx.S @@ -0,0 +1,91 @@ +/* + * Shared glue code for 128bit block ciphers, AVX assembler macros + * + * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + */ + +#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ +	vmovdqu (0*16)(src), x0; \ +	vmovdqu (1*16)(src), x1; \ +	vmovdqu (2*16)(src), x2; \ +	vmovdqu (3*16)(src), x3; \ +	vmovdqu (4*16)(src), x4; \ +	vmovdqu (5*16)(src), x5; \ +	vmovdqu (6*16)(src), x6; \ +	vmovdqu (7*16)(src), x7; + +#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ +	vmovdqu x0, (0*16)(dst); \ +	vmovdqu x1, (1*16)(dst); \ +	vmovdqu x2, (2*16)(dst); \ +	vmovdqu x3, (3*16)(dst); \ +	vmovdqu x4, (4*16)(dst); \ +	vmovdqu x5, (5*16)(dst); \ +	vmovdqu x6, (6*16)(dst); \ +	vmovdqu x7, (7*16)(dst); + +#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ +	vpxor (0*16)(src), x1, x1; \ +	vpxor (1*16)(src), x2, x2; \ +	vpxor (2*16)(src), x3, x3; \ +	vpxor (3*16)(src), x4, x4; \ +	vpxor (4*16)(src), x5, x5; \ +	vpxor (5*16)(src), x6, x6; \ +	vpxor (6*16)(src), x7, x7; \ +	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); + +#define inc_le128(x, minus_one, tmp) \ +	vpcmpeqq minus_one, x, tmp; \ +	vpsubq minus_one, x, x; \ +	vpslldq $8, tmp, tmp; \ +	vpsubq tmp, x, x; + +#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \ +	vpcmpeqd t0, t0, t0; \ +	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \ +	vmovdqa bswap, t1; \ +	\ +	/* load IV and byteswap */ \ +	vmovdqu (iv), x7; \ +	vpshufb t1, x7, x0; \ +	\ +	/* construct IVs */ \ +	inc_le128(x7, t0, t2); \ +	vpshufb t1, x7, x1; \ +	inc_le128(x7, t0, t2); \ +	vpshufb t1, x7, x2; \ +	inc_le128(x7, t0, t2); \ +	vpshufb t1, x7, x3; \ +	inc_le128(x7, t0, t2); \ +	vpshufb t1, x7, x4; \ +	inc_le128(x7, t0, t2); \ +	vpshufb t1, x7, x5; \ +	inc_le128(x7, t0, t2); \ +	vpshufb t1, x7, x6; \ +	inc_le128(x7, t0, t2); \ +	vmovdqa x7, t2; \ +	vpshufb t1, x7, x7; \ +	inc_le128(t2, t0, t1); \ +	vmovdqu t2, (iv); + +#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ +	vpxor (0*16)(src), x0, x0; \ +	vpxor (1*16)(src), x1, x1; \ +	vpxor (2*16)(src), x2, x2; \ +	vpxor (3*16)(src), x3, x3; \ +	vpxor (4*16)(src), x4, x4; \ +	vpxor (5*16)(src), x5, x5; \ +	vpxor (6*16)(src), x6, x6; \ +	vpxor (7*16)(src), x7, x7; \ +	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 30b3927bd73..22ce4f683e5 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,  	u8 *src = (u8 *)walk->src.virt.addr;  	u8 *dst = (u8 *)walk->dst.virt.addr;  	unsigned int nbytes = walk->nbytes; -	u128 ctrblk; +	le128 ctrblk;  	u128 tmp; -	be128_to_u128(&ctrblk, (be128 *)walk->iv); +	be128_to_le128(&ctrblk, (be128 *)walk->iv);  	memcpy(&tmp, src, nbytes);  	fn_ctr(ctx, &tmp, &tmp, &ctrblk);  	memcpy(dst, &tmp, nbytes); -	u128_to_be128((be128 *)walk->iv, &ctrblk); +	le128_to_be128((be128 *)walk->iv, &ctrblk);  }  EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); @@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,  	unsigned int nbytes = walk->nbytes;  	u128 *src = (u128 *)walk->src.virt.addr;  	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 ctrblk; +	le128 ctrblk;  	unsigned int num_blocks, func_bytes;  	unsigned int i; -	be128_to_u128(&ctrblk, (be128 *)walk->iv); +	be128_to_le128(&ctrblk, (be128 *)walk->iv);  	/* Process multi-block batch */  	for (i = 0; i < gctx->num_funcs; i++) { @@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,  	}  done: -	u128_to_be128((be128 *)walk->iv, 
&ctrblk); +	le128_to_be128((be128 *)walk->iv, &ctrblk);  	return nbytes;  } diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 504106bf04a..02b0e9fe997 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -24,7 +24,16 @@   *   */ +#include "glue_helper-asm-avx.S" +  .file "serpent-avx-x86_64-asm_64.S" + +.data +.align 16 + +.Lbswap128_mask: +	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +  .text  #define CTX %rdi @@ -550,51 +559,27 @@  	vpunpcklqdq		x3, t2, x2; \  	vpunpckhqdq		x3, t2, x3; -#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ -	vmovdqu (0*4*4)(in),	x0; \ -	vmovdqu (1*4*4)(in),	x1; \ -	vmovdqu (2*4*4)(in),	x2; \ -	vmovdqu (3*4*4)(in),	x3; \ -	\ +#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \  	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ -	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ -	\ -	vmovdqu x0,		(0*4*4)(out); \ -	vmovdqu x1,		(1*4*4)(out); \ -	vmovdqu x2,		(2*4*4)(out); \ -	vmovdqu x3,		(3*4*4)(out); - -#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ -	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ -	\ -	vpxor (0*4*4)(out),	x0, x0;       \ -	vmovdqu x0,		(0*4*4)(out); \ -	vpxor (1*4*4)(out),	x1, x1;       \ -	vmovdqu x1,		(1*4*4)(out); \ -	vpxor (2*4*4)(out),	x2, x2;       \ -	vmovdqu x2,		(2*4*4)(out); \ -	vpxor (3*4*4)(out),	x3, x3;       \ -	vmovdqu x3,		(3*4*4)(out); +#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ +	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)  .align 8 -.global __serpent_enc_blk_8way_avx -.type   __serpent_enc_blk_8way_avx,@function; +.type   __serpent_enc_blk8_avx,@function; -__serpent_enc_blk_8way_avx: +__serpent_enc_blk8_avx:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src -	 *	%rcx: bool, if true: xor output +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks +	 * output: +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks  	 */  	vpcmpeqd RNOT, RNOT, RNOT; -	leaq (4*4*4)(%rdx), %rax; -	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); -	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); +	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); +	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);  						 K2(RA, RB, RC, RD, RE, 0);  	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1); @@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:  	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);  	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32); -	leaq (4*4*4)(%rsi), %rax; - -	testb %cl, %cl; -	jnz __enc_xor8; - -	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); -	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); - -	ret; - -__enc_xor8: -	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); -	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); +	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); +	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);  	ret;  .align 8 -.global serpent_dec_blk_8way_avx -.type   serpent_dec_blk_8way_avx,@function; +.type   __serpent_dec_blk8_avx,@function; -serpent_dec_blk_8way_avx: +__serpent_dec_blk8_avx:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks +	 * output: +	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks  	 */  	vpcmpeqd RNOT, RNOT, RNOT; -	leaq (4*4*4)(%rdx), %rax; -	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); -	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); +	read_blocks(RA1, 
RB1, RC1, RD1, RK0, RK1, RK2); +	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);  						 K2(RA, RB, RC, RD, RE, 32);  	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31); @@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:  	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);  	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0); -	leaq (4*4*4)(%rsi), %rax; -	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); -	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); +	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); +	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); + +	ret; + +.align 8 +.global serpent_ecb_enc_8way_avx +.type   serpent_ecb_enc_8way_avx,@function; + +serpent_ecb_enc_8way_avx: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	call __serpent_enc_blk8_avx; + +	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	ret; + +.align 8 +.global serpent_ecb_dec_8way_avx +.type   serpent_ecb_dec_8way_avx,@function; + +serpent_ecb_dec_8way_avx: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	call __serpent_dec_blk8_avx; + +	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + +	ret; + +.align 8 +.global serpent_cbc_dec_8way_avx +.type   serpent_cbc_dec_8way_avx,@function; + +serpent_cbc_dec_8way_avx: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	call __serpent_dec_blk8_avx; + +	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + +	ret; + +.align 8 +.global serpent_ctr_8way_avx +.type   serpent_ctr_8way_avx,@function; + +serpent_ctr_8way_avx: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 *	%rcx: iv (little endian, 128bit) +	 */ + +	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, +		      RD2, RK0, RK1, RK2); + +	call __serpent_enc_blk8_avx; + +	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);  	ret; diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 3f543a04cf1..52abaaf28e7 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -42,55 +42,24 @@  #include <asm/crypto/ablk_helper.h>  #include <asm/crypto/glue_helper.h> -static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) -{ -	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; -	unsigned int j; - -	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) -		ivs[j] = src[j]; - -	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - -	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) -		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); -} - -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)  {  	be128 ctrblk; -	u128_to_be128(&ctrblk, iv); -	u128_inc(iv); +	le128_to_be128(&ctrblk, iv); +	le128_inc(iv);  	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);  	u128_xor(dst, src, (u128 *)&ctrblk);  } -static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, -				   u128 *iv) -{ -	be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; -	unsigned int i; - -	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { -		if (dst != src) -			dst[i] = src[i]; - -		u128_to_be128(&ctrblks[i], iv); -		u128_inc(iv); -	} - -	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); -} -  static const struct 
common_glue_ctx serpent_enc = {  	.num_funcs = 2,  	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,  	.funcs = { {  		.num_blocks = SERPENT_PARALLEL_BLOCKS, -		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } +		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }  	}, {  		.num_blocks = 1,  		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } @@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {  	.funcs = { {  		.num_blocks = SERPENT_PARALLEL_BLOCKS, -		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }  	}, {  		.num_blocks = 1,  		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } @@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {  	.funcs = { {  		.num_blocks = SERPENT_PARALLEL_BLOCKS, -		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } +		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }  	}, {  		.num_blocks = 1,  		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } @@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {  	.funcs = { {  		.num_blocks = SERPENT_PARALLEL_BLOCKS, -		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }  	}, {  		.num_blocks = 1,  		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } @@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);  	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { -		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); +		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);  		return;  	} @@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);  	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { -		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); +		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);  		return;  	} diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 9107a9908c4..97a356ece24 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)  		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);  } -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)  {  	be128 ctrblk; -	u128_to_be128(&ctrblk, iv); -	u128_inc(iv); +	le128_to_be128(&ctrblk, iv); +	le128_inc(iv);  	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);  	u128_xor(dst, src, (u128 *)&ctrblk);  }  static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, -				   u128 *iv) +				   le128 *iv)  {  	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];  	unsigned int i; @@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,  		if (dst != src)  			dst[i] = src[i]; -		u128_to_be128(&ctrblks[i], iv); -		u128_inc(iv); +		le128_to_be128(&ctrblks[i], iv); +		le128_inc(iv);  	}  	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 1585abb13dd..ebac16bfa83 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -23,7 +23,16 @@   *   */ +#include 
"glue_helper-asm-avx.S" +  .file "twofish-avx-x86_64-asm_64.S" + +.data +.align 16 + +.Lbswap128_mask: +	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +  .text  /* structure of crypto context */ @@ -217,69 +226,45 @@  	vpunpcklqdq		x3, t2, x2; \  	vpunpckhqdq		x3, t2, x3; -#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ -	vpxor (0*4*4)(in),	wkey, x0; \ -	vpxor (1*4*4)(in),	wkey, x1; \ -	vpxor (2*4*4)(in),	wkey, x2; \ -	vpxor (3*4*4)(in),	wkey, x3; \ +#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \ +	vpxor		x0, wkey, x0; \ +	vpxor		x1, wkey, x1; \ +	vpxor		x2, wkey, x2; \ +	vpxor		x3, wkey, x3; \  	\  	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ -	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ -	\ -	vpxor		x0, wkey, x0;     \ -	vmovdqu 	x0, (0*4*4)(out); \ -	vpxor		x1, wkey, x1;     \ -	vmovdqu		x1, (1*4*4)(out); \ -	vpxor		x2, wkey, x2;     \ -	vmovdqu		x2, (2*4*4)(out); \ -	vpxor		x3, wkey, x3;     \ -	vmovdqu		x3, (3*4*4)(out); - -#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ +#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \  	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \  	\ -	vpxor		x0, wkey, x0;         \ -	vpxor		(0*4*4)(out), x0, x0; \ -	vmovdqu 	x0, (0*4*4)(out);     \ -	vpxor		x1, wkey, x1;         \ -	vpxor		(1*4*4)(out), x1, x1; \ -	vmovdqu	        x1, (1*4*4)(out);     \ -	vpxor		x2, wkey, x2;         \ -	vpxor           (2*4*4)(out), x2, x2; \ -	vmovdqu		x2, (2*4*4)(out);     \ -	vpxor		x3, wkey, x3;         \ -	vpxor           (3*4*4)(out), x3, x3; \ -	vmovdqu		x3, (3*4*4)(out); +	vpxor		x0, wkey, x0; \ +	vpxor		x1, wkey, x1; \ +	vpxor		x2, wkey, x2; \ +	vpxor		x3, wkey, x3;  .align 8 -.global __twofish_enc_blk_8way -.type   __twofish_enc_blk_8way,@function; +.type	__twofish_enc_blk8,@function; -__twofish_enc_blk_8way: +__twofish_enc_blk8:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src -	 *	%rcx: bool, if true: xor output +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks +	 * output: +	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks  	 */ +	vmovdqu w(CTX), RK1; +  	pushq %rbp;  	pushq %rbx;  	pushq %rcx; -	vmovdqu w(CTX), RK1; - -	leaq (4*4*4)(%rdx), %rax; -	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); +	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);  	preload_rgi(RA1);  	rotate_1l(RD1); -	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); +	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);  	rotate_1l(RD2); -	movq %rsi, %r11; -  	encrypt_cycle(0);  	encrypt_cycle(1);  	encrypt_cycle(2); @@ -295,47 +280,33 @@ __twofish_enc_blk_8way:  	popq %rbx;  	popq %rbp; -	leaq (4*4*4)(%r11), %rax; - -	testb %cl, %cl; -	jnz __enc_xor8; - -	outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); -	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); - -	ret; - -__enc_xor8: -	outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); -	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); +	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); +	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);  	ret;  .align 8 -.global twofish_dec_blk_8way -.type   twofish_dec_blk_8way,@function; +.type	__twofish_dec_blk8,@function; -twofish_dec_blk_8way: +__twofish_dec_blk8:  	/* input:  	 *	%rdi: ctx, CTX -	 *	%rsi: dst -	 *	%rdx: src +	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks +	 * output: +	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: 
decrypted blocks  	 */ +	vmovdqu (w+4*4)(CTX), RK1; +  	pushq %rbp;  	pushq %rbx; -	vmovdqu (w+4*4)(CTX), RK1; - -	leaq (4*4*4)(%rdx), %rax; -	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); +	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);  	preload_rgi(RC1);  	rotate_1l(RA1); -	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); +	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);  	rotate_1l(RA2); -	movq %rsi, %r11; -  	decrypt_cycle(7);  	decrypt_cycle(6);  	decrypt_cycle(5); @@ -350,8 +321,103 @@ twofish_dec_blk_8way:  	popq %rbx;  	popq %rbp; -	leaq (4*4*4)(%r11), %rax; -	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); -	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); +	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); +	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); + +	ret; + +.align 8 +.global twofish_ecb_enc_8way +.type   twofish_ecb_enc_8way,@function; + +twofish_ecb_enc_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	movq %rsi, %r11; + +	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	call __twofish_enc_blk8; + +	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + +	ret; + +.align 8 +.global twofish_ecb_dec_8way +.type   twofish_ecb_dec_8way,@function; + +twofish_ecb_dec_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	movq %rsi, %r11; + +	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + +	call __twofish_dec_blk8; + +	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	ret; + +.align 8 +.global twofish_cbc_dec_8way +.type   twofish_cbc_dec_8way,@function; + +twofish_cbc_dec_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	pushq %r12; + +	movq %rsi, %r11; +	movq %rdx, %r12; + +	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + +	call __twofish_dec_blk8; + +	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + +	popq %r12; + +	ret; + +.align 8 +.global twofish_ctr_8way +.type   twofish_ctr_8way,@function; + +twofish_ctr_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 *	%rcx: iv (little endian, 128bit) +	 */ + +	pushq %r12; + +	movq %rsi, %r11; +	movq %rdx, %r12; + +	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, +		      RD2, RX0, RX1, RY0); + +	call __twofish_enc_blk8; + +	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + +	popq %r12;  	ret; diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index e7708b5442e..94ac91d26e4 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -45,66 +45,23 @@  #define TWOFISH_PARALLEL_BLOCKS 8 -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, -					const u8 *src) -{ -	__twofish_enc_blk_3way(ctx, dst, src, false); -} -  /* 8-way parallel cipher functions */ -asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, -				       const u8 *src, bool xor); -asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, +asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, +				     const u8 *src); +asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,  				     const u8 *src); -static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, -					const u8 *src) -{ -	__twofish_enc_blk_8way(ctx, dst, src, false); -} - -static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 
*dst, -					    const u8 *src) -{ -	__twofish_enc_blk_8way(ctx, dst, src, true); -} +asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, +				     const u8 *src); +asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, +				 const u8 *src, le128 *iv); -static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, +static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,  					const u8 *src)  { -	twofish_dec_blk_8way(ctx, dst, src); -} - -static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src) -{ -	u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; -	unsigned int j; - -	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) -		ivs[j] = src[j]; - -	twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - -	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) -		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +	__twofish_enc_blk_3way(ctx, dst, src, false);  } -static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src, -				     u128 *iv) -{ -	be128 ctrblks[TWOFISH_PARALLEL_BLOCKS]; -	unsigned int i; - -	for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { -		if (dst != src) -			dst[i] = src[i]; - -		u128_to_be128(&ctrblks[i], iv); -		u128_inc(iv); -	} - -	twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); -}  static const struct common_glue_ctx twofish_enc = {  	.num_funcs = 3, @@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {  	.funcs = { {  		.num_blocks = TWOFISH_PARALLEL_BLOCKS, -		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }  	}, {  		.num_blocks = 3,  		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } @@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {  	.funcs = { {  		.num_blocks = TWOFISH_PARALLEL_BLOCKS, -		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }  	}, {  		.num_blocks = 3,  		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } @@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {  	.funcs = { {  		.num_blocks = TWOFISH_PARALLEL_BLOCKS, -		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }  	}, {  		.num_blocks = 3,  		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } @@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {  	.funcs = { {  		.num_blocks = TWOFISH_PARALLEL_BLOCKS, -		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }  	}, {  		.num_blocks = 3,  		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } @@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);  	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { -		twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); +		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);  		return;  	} @@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);  	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { -		twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); +		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);  		return;  	} diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index aa3eb358b7e..13e63b3e1df 100644 --- 
a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)  }  EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)  {  	be128 ctrblk;  	if (dst != src)  		*dst = *src; -	u128_to_be128(&ctrblk, iv); -	u128_inc(iv); +	le128_to_be128(&ctrblk, iv); +	le128_inc(iv);  	twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);  	u128_xor(dst, dst, (u128 *)&ctrblk); @@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)  EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);  void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, -				     u128 *iv) +			      le128 *iv)  {  	be128 ctrblks[3]; @@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,  		dst[2] = src[2];  	} -	u128_to_be128(&ctrblks[0], iv); -	u128_inc(iv); -	u128_to_be128(&ctrblks[1], iv); -	u128_inc(iv); -	u128_to_be128(&ctrblks[2], iv); -	u128_inc(iv); +	le128_to_be128(&ctrblks[0], iv); +	le128_inc(iv); +	le128_to_be128(&ctrblks[1], iv); +	le128_inc(iv); +	le128_to_be128(&ctrblks[2], iv); +	le128_inc(iv);  	twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);  } diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h new file mode 100644 index 00000000000..98038add801 --- /dev/null +++ b/arch/x86/include/asm/crypto/camellia.h @@ -0,0 +1,82 @@ +#ifndef ASM_X86_CAMELLIA_H +#define ASM_X86_CAMELLIA_H + +#include <linux/kernel.h> +#include <linux/crypto.h> + +#define CAMELLIA_MIN_KEY_SIZE	16 +#define CAMELLIA_MAX_KEY_SIZE	32 +#define CAMELLIA_BLOCK_SIZE	16 +#define CAMELLIA_TABLE_BYTE_LEN	272 +#define CAMELLIA_PARALLEL_BLOCKS 2 + +struct camellia_ctx { +	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; +	u32 key_length; +}; + +struct camellia_lrw_ctx { +	struct lrw_table_ctx lrw_table; +	struct camellia_ctx camellia_ctx; +}; + +struct camellia_xts_ctx { +	struct camellia_ctx tweak_ctx; +	struct camellia_ctx crypt_ctx; +}; + +extern int __camellia_setkey(struct camellia_ctx *cctx, +			     const unsigned char *key, +			     unsigned int key_len, u32 *flags); + +extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, +			       unsigned int keylen); +extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm); + +extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, +			       unsigned int keylen); + +/* regular block cipher functions */ +asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, +				   const u8 *src, bool xor); +asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, +				 const u8 *src); + +/* 2-way parallel cipher functions */ +asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, +					const u8 *src, bool xor); +asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, +				      const u8 *src); + +static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, +				    const u8 *src) +{ +	__camellia_enc_blk(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, +					const u8 *src) +{ +	__camellia_enc_blk(ctx, dst, src, true); +} + +static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, +					 const u8 *src) +{ +	__camellia_enc_blk_2way(ctx, dst, src, false); +} + +static inline void 
camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, +					     const u8 *src) +{ +	__camellia_enc_blk_2way(ctx, dst, src, true); +} + +/* glue helpers */ +extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src); +extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, +			       le128 *iv); +extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, +				    le128 *iv); + +#endif /* ASM_X86_CAMELLIA_H */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 3e408bddc96..e2d65b061d2 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -13,7 +13,7 @@  typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);  typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);  typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, -				       u128 *iv); +				       le128 *iv);  #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))  #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) @@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled)  		kernel_fpu_end();  } -static inline void u128_to_be128(be128 *dst, const u128 *src) +static inline void le128_to_be128(be128 *dst, const le128 *src)  { -	dst->a = cpu_to_be64(src->a); -	dst->b = cpu_to_be64(src->b); +	dst->a = cpu_to_be64(le64_to_cpu(src->a)); +	dst->b = cpu_to_be64(le64_to_cpu(src->b));  } -static inline void be128_to_u128(u128 *dst, const be128 *src) +static inline void be128_to_le128(le128 *dst, const be128 *src)  { -	dst->a = be64_to_cpu(src->a); -	dst->b = be64_to_cpu(src->b); +	dst->a = cpu_to_le64(be64_to_cpu(src->a)); +	dst->b = cpu_to_le64(be64_to_cpu(src->b));  } -static inline void u128_inc(u128 *i) +static inline void le128_inc(le128 *i)  { -	i->b++; -	if (!i->b) -		i->a++; +	u64 a = le64_to_cpu(i->a); +	u64 b = le64_to_cpu(i->b); + +	b++; +	if (!b) +		a++; + +	i->a = cpu_to_le64(a); +	i->b = cpu_to_le64(b);  }  extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 432deedd294..0da1d3e2a55 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -6,27 +6,14 @@  #define SERPENT_PARALLEL_BLOCKS 8 -asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, -					   const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, +					 const u8 *src); +asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,  					 const u8 *src); -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, -				   const u8 *src) -{ -	__serpent_enc_blk_8way_avx(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, -				       const u8 *src) -{ -	__serpent_enc_blk_8way_avx(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, -				   const u8 *src) -{ -	serpent_dec_blk_8way_avx(ctx, dst, src); -} +asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +					 const u8 *src); +asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, +				     const u8 *src, le128 *iv);  #endif diff --git a/arch/x86/include/asm/crypto/twofish.h 
b/arch/x86/include/asm/crypto/twofish.h index 9d2c514bd5f..878c51ceebb 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,  /* helpers from twofish_x86_64-3way module */  extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);  extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, -				u128 *iv); +				le128 *iv);  extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, -				     u128 *iv); +				     le128 *iv);  extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,  			      unsigned int keylen);  |
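
For reference, below is a minimal standalone sketch (plain userspace C, not kernel code and not part of this patch) of the counter handling that the new le128 helpers in glue_helper.h and the converted CTR paths rely on: the IV is kept as two 64-bit halves, incremented with a carry from the low half into the high half, and serialized to a big-endian 16-byte block before it is fed to the block cipher. The struct and function names here are invented for illustration; only the carry and byte-order behaviour is meant to mirror le128_inc() and le128_to_be128().

```c
/*
 * Illustrative sketch of the 128-bit CTR counter handling used by the
 * le128-based glue code above. Not kernel code: the kernel versions use
 * the le128/be128 types and cpu_to_be64()/le64_to_cpu(); this sketch
 * uses host-order integers and explicit byte extraction instead.
 */
#include <stdint.h>
#include <stdio.h>

struct ctr128 {
	uint64_t a;	/* high 64 bits of the counter */
	uint64_t b;	/* low 64 bits of the counter */
};

/* mirrors le128_inc(): increment low half, carry into high half on wrap */
static void ctr128_inc(struct ctr128 *i)
{
	i->b++;
	if (i->b == 0)
		i->a++;
}

/* mirrors le128_to_be128(): emit the counter as a big-endian 16-byte block */
static void ctr128_to_be_block(uint8_t out[16], const struct ctr128 *i)
{
	for (int n = 0; n < 8; n++) {
		out[n]     = (uint8_t)(i->a >> (56 - 8 * n));
		out[8 + n] = (uint8_t)(i->b >> (56 - 8 * n));
	}
}

int main(void)
{
	struct ctr128 iv = { .a = 0, .b = UINT64_MAX };	/* low half about to wrap */
	uint8_t block[16];

	ctr128_to_be_block(block, &iv);
	for (int n = 0; n < 16; n++)
		printf("%02x", block[n]);
	printf("\n");	/* 0000000000000000ffffffffffffffff */

	ctr128_inc(&iv);	/* low half wraps to zero, carry into high half */
	ctr128_to_be_block(block, &iv);
	for (int n = 0; n < 16; n++)
		printf("%02x", block[n]);
	printf("\n");	/* 00000000000000010000000000000000 */

	return 0;
}
```

In the converted glue code each CTR step follows the same pattern in kernel terms: le128_to_be128() builds the big-endian counter block, le128_inc() advances the IV, the block is encrypted, and the result is XORed into the plaintext; the 8-way assembly entry points do the byte swap with the .Lbswap128_mask constant instead.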