Index: sys/conf/files.amd64
===================================================================
--- sys/conf/files.amd64	(revision 254981)
+++ sys/conf/files.amd64	(working copy)
@@ -140,10 +140,13 @@
 amd64/amd64/uma_machdep.c	standard
 amd64/amd64/vm_machdep.c	standard
 amd64/pci/pci_cfgreg.c		optional	pci
-crypto/aesni/aesencdec_amd64.S	optional	aesni
 crypto/aesni/aeskeys_amd64.S	optional	aesni
 crypto/aesni/aesni.c		optional	aesni
-crypto/aesni/aesni_wrap.c	optional	aesni
+aesni_wrap.o			optional	aesni			\
+	dependency	"$S/crypto/aesni/aesni_wrap.c"			\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -maes ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"aesni_wrap.o"
 crypto/blowfish/bf_enc.c	optional	crypto | ipsec
 crypto/des/des_enc.c		optional	crypto | ipsec | netsmb
 crypto/via/padlock.c		optional	padlock
Index: sys/conf/files.i386
===================================================================
--- sys/conf/files.i386	(revision 254981)
+++ sys/conf/files.i386	(working copy)
@@ -126,7 +126,11 @@
 crypto/aesni/aesencdec_i386.S	optional	aesni
 crypto/aesni/aeskeys_i386.S	optional	aesni
 crypto/aesni/aesni.c		optional	aesni
-crypto/aesni/aesni_wrap.c	optional	aesni
+aesni_wrap.o			optional	aesni			\
+	dependency	"$S/crypto/aesni/aesni_wrap.c"			\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -maes ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"aesni_wrap.o"
 crypto/des/arch/i386/des_enc.S	optional	crypto | ipsec | netsmb
 crypto/via/padlock.c		optional	padlock
 crypto/via/padlock_cipher.c	optional	padlock
Index: sys/crypto/aesni/aesencdec.h
===================================================================
--- sys/crypto/aesni/aesencdec.h	(revision 0)
+++ sys/crypto/aesni/aesencdec.h	(working copy)
@@ -0,0 +1,136 @@
+/*-
+ * Copyright 2013 John-Mark Gurney
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#include <wmmintrin.h>
+
+static inline void
+aesni_enc8(int rounds, const uint8_t *key_schedule, __m128i a,
+    __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g,
+    __m128i h, __m128i out[8])
+{
+	const __m128i *keysched = (const __m128i *)key_schedule;
+	int i;
+
+	a ^= keysched[0];
+	b ^= keysched[0];
+	c ^= keysched[0];
+	d ^= keysched[0];
+	e ^= keysched[0];
+	f ^= keysched[0];
+	g ^= keysched[0];
+	h ^= keysched[0];
+
+	for (i = 0; i < rounds; i++) {
+		a = _mm_aesenc_si128(a, keysched[i + 1]);
+		b = _mm_aesenc_si128(b, keysched[i + 1]);
+		c = _mm_aesenc_si128(c, keysched[i + 1]);
+		d = _mm_aesenc_si128(d, keysched[i + 1]);
+		e = _mm_aesenc_si128(e, keysched[i + 1]);
+		f = _mm_aesenc_si128(f, keysched[i + 1]);
+		g = _mm_aesenc_si128(g, keysched[i + 1]);
+		h = _mm_aesenc_si128(h, keysched[i + 1]);
+	}
+
+	out[0] = _mm_aesenclast_si128(a, keysched[i + 1]);
+	out[1] = _mm_aesenclast_si128(b, keysched[i + 1]);
+	out[2] = _mm_aesenclast_si128(c, keysched[i + 1]);
+	out[3] = _mm_aesenclast_si128(d, keysched[i + 1]);
+	out[4] = _mm_aesenclast_si128(e, keysched[i + 1]);
+	out[5] = _mm_aesenclast_si128(f, keysched[i + 1]);
+	out[6] = _mm_aesenclast_si128(g, keysched[i + 1]);
+	out[7] = _mm_aesenclast_si128(h, keysched[i + 1]);
+}
+
+static inline void
+aesni_dec8(int rounds, const uint8_t *key_schedule, __m128i a,
+    __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g,
+    __m128i h, __m128i out[8])
+{
+	const __m128i *keysched = (const __m128i *)key_schedule;
+	int i;
+
+	a ^= keysched[0];
+	b ^= keysched[0];
+	c ^= keysched[0];
+	d ^= keysched[0];
+	e ^= keysched[0];
+	f ^= keysched[0];
+	g ^= keysched[0];
+	h ^= keysched[0];
+
+	for (i = 0; i < rounds; i++) {
+		a = _mm_aesdec_si128(a, keysched[i + 1]);
+		b = _mm_aesdec_si128(b, keysched[i + 1]);
+		c = _mm_aesdec_si128(c, keysched[i + 1]);
+		d = _mm_aesdec_si128(d, keysched[i + 1]);
+		e = _mm_aesdec_si128(e, keysched[i + 1]);
+		f = _mm_aesdec_si128(f, keysched[i + 1]);
+		g = _mm_aesdec_si128(g, keysched[i + 1]);
+		h = _mm_aesdec_si128(h, keysched[i + 1]);
+	}
+
+	out[0] = _mm_aesdeclast_si128(a, keysched[i + 1]);
+	out[1] = _mm_aesdeclast_si128(b, keysched[i + 1]);
+	out[2] = _mm_aesdeclast_si128(c, keysched[i + 1]);
+	out[3] = _mm_aesdeclast_si128(d, keysched[i + 1]);
+	out[4] = _mm_aesdeclast_si128(e, keysched[i + 1]);
+	out[5] = _mm_aesdeclast_si128(f, keysched[i + 1]);
+	out[6] = _mm_aesdeclast_si128(g, keysched[i + 1]);
+	out[7] = _mm_aesdeclast_si128(h, keysched[i + 1]);
+}
+
+static inline __m128i
+aesni_enc(int rounds, const uint8_t *key_schedule, const __m128i from)
+{
+	__m128i tmp;
+	const __m128i *keysched = (const __m128i *)key_schedule;
+	int i;
+
+	tmp = from ^ keysched[0];
+
+	for (i = 0; i < rounds; i++)
+		tmp = _mm_aesenc_si128(tmp, keysched[i + 1]);
+
+	return _mm_aesenclast_si128(tmp, keysched[i + 1]);
+}
+
+static inline __m128i
+aesni_dec(int rounds, const uint8_t *key_schedule, const __m128i from)
+{
+	__m128i tmp;
+	const __m128i *keysched = (const __m128i *)key_schedule;
+	int i;
+
+	tmp = from ^ keysched[0];
+
+	for (i = 0; i < rounds; i++)
+		tmp = _mm_aesdec_si128(tmp, keysched[i + 1]);
+
+	return _mm_aesdeclast_si128(tmp, keysched[i + 1]);
+}
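Not part of the patch: the eight-wide helpers above work because aesenc has a
multi-cycle latency but roughly one-per-cycle throughput, so eight independent
blocks keep the AES unit's pipeline full instead of stalling on each round's
result.  A self-contained sketch of the same interleaving, using made-up round
keys rather than a real key schedule and the same GCC/Clang __m128i ^ operator
the patch relies on (build with e.g. "cc -maes sketch.c"):

#include <stdint.h>
#include <stdio.h>
#include <wmmintrin.h>

int
main(void)
{
	__m128i key[11];	/* dummy AES-128 round keys, NOT a real schedule */
	__m128i blk[8];
	uint8_t out[16];
	int i, r;

	for (i = 0; i < 11; i++)
		key[i] = _mm_set1_epi32(i * 0x01010101);
	for (i = 0; i < 8; i++)
		blk[i] = _mm_set1_epi32(0x100 + i);

	for (i = 0; i < 8; i++)		/* whitening, as in aesni_enc8() */
		blk[i] ^= key[0];
	for (r = 1; r < 10; r++)	/* eight independent ops per round */
		for (i = 0; i < 8; i++)
			blk[i] = _mm_aesenc_si128(blk[i], key[r]);
	for (i = 0; i < 8; i++)
		blk[i] = _mm_aesenclast_si128(blk[i], key[10]);

	_mm_storeu_si128((__m128i *)out, blk[0]);
	for (i = 0; i < 16; i++)
		printf("%02x", out[i]);
	printf("\n");
	return (0);
}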
Index: sys/crypto/aesni/aeskeys_amd64.S
===================================================================
--- sys/crypto/aesni/aeskeys_amd64.S	(revision 254981)
+++ sys/crypto/aesni/aeskeys_amd64.S	(working copy)
@@ -125,103 +125,72 @@
 	movups	0x10(%rdi),%xmm2	# other user key
 	movaps	%xmm2,(%rsi)
 	addq	$0x10,%rsi
-//	aeskeygenassist	$0x1,%xmm2,%xmm1	# round 1
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x01
+	aeskeygenassist	$0x1,%xmm2,%xmm1	# round 1
 	call	_key_expansion_256a
-//	aeskeygenassist	$0x1,%xmm0,%xmm1
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x01
+	aeskeygenassist	$0x1,%xmm0,%xmm1
 	call	_key_expansion_256b
-//	aeskeygenassist	$0x2,%xmm2,%xmm1	# round 2
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x02
+	aeskeygenassist	$0x2,%xmm2,%xmm1	# round 2
 	call	_key_expansion_256a
-//	aeskeygenassist	$0x2,%xmm0,%xmm1
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x02
+	aeskeygenassist	$0x2,%xmm0,%xmm1
 	call	_key_expansion_256b
-//	aeskeygenassist	$0x4,%xmm2,%xmm1	# round 3
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x04
+	aeskeygenassist	$0x4,%xmm2,%xmm1	# round 3
 	call	_key_expansion_256a
-//	aeskeygenassist	$0x4,%xmm0,%xmm1
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x04
+	aeskeygenassist	$0x4,%xmm0,%xmm1
 	call	_key_expansion_256b
-//	aeskeygenassist	$0x8,%xmm2,%xmm1	# round 4
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x08
+	aeskeygenassist	$0x8,%xmm2,%xmm1	# round 4
 	call	_key_expansion_256a
-//	aeskeygenassist	$0x8,%xmm0,%xmm1
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x08
+	aeskeygenassist	$0x8,%xmm0,%xmm1
 	call	_key_expansion_256b
-//	aeskeygenassist	$0x10,%xmm2,%xmm1	# round 5
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x10
+	aeskeygenassist	$0x10,%xmm2,%xmm1	# round 5
 	call	_key_expansion_256a
-//	aeskeygenassist	$0x10,%xmm0,%xmm1
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x10
+	aeskeygenassist	$0x10,%xmm0,%xmm1
 	call	_key_expansion_256b
-//	aeskeygenassist	$0x20,%xmm2,%xmm1	# round 6
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x20
+	aeskeygenassist	$0x20,%xmm2,%xmm1	# round 6
 	call	_key_expansion_256a
-//	aeskeygenassist	$0x20,%xmm0,%xmm1
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x20
+	aeskeygenassist	$0x20,%xmm0,%xmm1
 	call	_key_expansion_256b
-//	aeskeygenassist	$0x40,%xmm2,%xmm1	# round 7
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x40
+	aeskeygenassist	$0x40,%xmm2,%xmm1	# round 7
 	call	_key_expansion_256a
 	retq
 .Lenc_key192:
 	movq	0x10(%rdi),%xmm2	# other user key
-//	aeskeygenassist	$0x1,%xmm2,%xmm1	# round 1
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x01
+	aeskeygenassist	$0x1,%xmm2,%xmm1	# round 1
 	call	_key_expansion_192a
-//	aeskeygenassist	$0x2,%xmm2,%xmm1	# round 2
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x02
+	aeskeygenassist	$0x2,%xmm2,%xmm1	# round 2
 	call	_key_expansion_192b
-//	aeskeygenassist	$0x4,%xmm2,%xmm1	# round 3
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x04
+	aeskeygenassist	$0x4,%xmm2,%xmm1	# round 3
 	call	_key_expansion_192a
-//	aeskeygenassist	$0x8,%xmm2,%xmm1	# round 4
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x08
+	aeskeygenassist	$0x8,%xmm2,%xmm1	# round 4
 	call	_key_expansion_192b
-//	aeskeygenassist	$0x10,%xmm2,%xmm1	# round 5
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x10
+	aeskeygenassist	$0x10,%xmm2,%xmm1	# round 5
 	call	_key_expansion_192a
-//	aeskeygenassist	$0x20,%xmm2,%xmm1	# round 6
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x20
+	aeskeygenassist	$0x20,%xmm2,%xmm1	# round 6
 	call	_key_expansion_192b
-//	aeskeygenassist	$0x40,%xmm2,%xmm1	# round 7
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x40
+	aeskeygenassist	$0x40,%xmm2,%xmm1	# round 7
 	call	_key_expansion_192a
-//	aeskeygenassist	$0x80,%xmm2,%xmm1	# round 8
-	.byte	0x66,0x0f,0x3a,0xdf,0xca,0x80
+	aeskeygenassist	$0x80,%xmm2,%xmm1	# round 8
 	call	_key_expansion_192b
 	retq
 .Lenc_key128:
-//	aeskeygenassist	$0x1,%xmm0,%xmm1	# round 1
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x01
+	aeskeygenassist	$0x1,%xmm0,%xmm1	# round 1
 	call	_key_expansion_128
-//	aeskeygenassist	$0x2,%xmm0,%xmm1	# round 2
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x02
+	aeskeygenassist	$0x2,%xmm0,%xmm1	# round 2
 	call	_key_expansion_128
-//	aeskeygenassist	$0x4,%xmm0,%xmm1	# round 3
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x04
+	aeskeygenassist	$0x4,%xmm0,%xmm1	# round 3
 	call	_key_expansion_128
-//	aeskeygenassist	$0x8,%xmm0,%xmm1	# round 4
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x08
+	aeskeygenassist	$0x8,%xmm0,%xmm1	# round 4
 	call	_key_expansion_128
-//	aeskeygenassist	$0x10,%xmm0,%xmm1	# round 5
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x10
+	aeskeygenassist	$0x10,%xmm0,%xmm1	# round 5
 	call	_key_expansion_128
-//	aeskeygenassist	$0x20,%xmm0,%xmm1	# round 6
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x20
+	aeskeygenassist	$0x20,%xmm0,%xmm1	# round 6
 	call	_key_expansion_128
-//	aeskeygenassist	$0x40,%xmm0,%xmm1	# round 7
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x40
+	aeskeygenassist	$0x40,%xmm0,%xmm1	# round 7
 	call	_key_expansion_128
-//	aeskeygenassist	$0x80,%xmm0,%xmm1	# round 8
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x80
+	aeskeygenassist	$0x80,%xmm0,%xmm1	# round 8
 	call	_key_expansion_128
-//	aeskeygenassist	$0x1b,%xmm0,%xmm1	# round 9
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x1b
+	aeskeygenassist	$0x1b,%xmm0,%xmm1	# round 9
 	call	_key_expansion_128
-//	aeskeygenassist	$0x36,%xmm0,%xmm1	# round 10
-	.byte	0x66,0x0f,0x3a,0xdf,0xc8,0x36
+	aeskeygenassist	$0x36,%xmm0,%xmm1	# round 10
 	call	_key_expansion_128
 	retq
 .cfi_endproc
@@ -238,8 +207,7 @@
 1:
 	addq	$0x10,%rsi
 	subq	$0x10,%rdi
-//	aesimc	(%rdi),%xmm1
-	.byte	0x66,0x0f,0x38,0xdb,0x0f
+	aesimc	(%rdi),%xmm1
 	movdqa	%xmm1,(%rsi)
 	decl	%edx
 	jne	1b
Index: sys/crypto/aesni/aesni.c
===================================================================
--- sys/crypto/aesni/aesni.c	(revision 254981)
+++ sys/crypto/aesni/aesni.c	(working copy)
@@ -40,7 +40,7 @@
 #include <sys/bus.h>
 #include <sys/uio.h>
 #include <crypto/aesni/aesni.h>
-#include "cryptodev_if.h"
+#include <cryptodev_if.h>
 
 struct aesni_softc {
 	int32_t cid;
@@ -74,6 +74,12 @@
 		device_printf(dev, "No AESNI support.\n");
 		return (EINVAL);
 	}
+
+	if ((cpu_feature & CPUID_SSE2) == 0) {
+		device_printf(dev, "No SSE2 support but AESNI!?!\n");
+		return (EINVAL);
+	}
+
 	device_set_desc_copy(dev, "AES-CBC,AES-XTS");
 	return (0);
 }
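Not part of the patch: the probe above keys off the kernel's cpu_feature word;
the same bits can be read in userland from CPUID leaf 1 (SSE2 is EDX bit 26,
AES-NI is ECX bit 25).  A minimal sketch using the GCC/Clang <cpuid.h> helper:

#include <stdio.h>
#include <cpuid.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return (1);
	printf("AESNI: %s, SSE2: %s\n",
	    (ecx & bit_AES) ? "yes" : "no",
	    (edx & bit_SSE2) ? "yes" : "no");
	return (0);
}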
Index: sys/crypto/aesni/aesni.h
===================================================================
--- sys/crypto/aesni/aesni.h	(revision 254981)
+++ sys/crypto/aesni/aesni.h	(working copy)
@@ -71,12 +71,6 @@
 /*
  * Internal functions, implemented in assembler.
  */
-void aesni_enc(int rounds, const uint8_t *key_schedule,
-    const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN],
-    const uint8_t iv[AES_BLOCK_LEN]);
-void aesni_dec(int rounds, const uint8_t *key_schedule,
-    const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN],
-    const uint8_t iv[AES_BLOCK_LEN]);
 void aesni_set_enckey(const uint8_t *userkey, uint8_t *encrypt_schedule,
     int number_of_rounds);
 void aesni_set_deckey(const uint8_t *encrypt_schedule,
@@ -88,12 +82,19 @@
 void aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
     const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]);
 void aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
-    const uint8_t *from, const uint8_t iv[AES_BLOCK_LEN]);
+    uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN]);
 void aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
     const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]);
 void aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
     const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]);
+void aesni_encrypt_xts(int rounds, const void *data_schedule,
+    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
+    const uint8_t iv[AES_BLOCK_LEN]);
+void aesni_decrypt_xts(int rounds, const void *data_schedule,
+    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
+    const uint8_t iv[AES_BLOCK_LEN]);
+
 int aesni_cipher_setup(struct aesni_session *ses,
     struct cryptoini *encini);
 int aesni_cipher_process(struct aesni_session *ses,
Index: sys/crypto/aesni/aesni_wrap.c
===================================================================
--- sys/crypto/aesni/aesni_wrap.c	(revision 254981)
+++ sys/crypto/aesni/aesni_wrap.c	(working copy)
@@ -1,6 +1,7 @@
 /*-
  * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
  * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright 2012-2013 John-Mark Gurney
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,34 +36,86 @@
 #include <sys/systm.h>
 #include <crypto/aesni/aesni.h>
 
+#include "aesencdec.h"
+
 MALLOC_DECLARE(M_AESNI);
 
 void
 aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
     const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN])
 {
-	const uint8_t *ivp;
+	__m128i tot, ivreg;
 	size_t i;
 
 	len /= AES_BLOCK_LEN;
-	ivp = iv;
+	ivreg = _mm_loadu_si128((const __m128i *)iv);
 	for (i = 0; i < len; i++) {
-		aesni_enc(rounds - 1, key_schedule, from, to, ivp);
-		ivp = to;
+		tot = aesni_enc(rounds - 1, key_schedule,
+		    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
+		ivreg = tot;
+		_mm_storeu_si128((__m128i *)to, tot);
 		from += AES_BLOCK_LEN;
 		to += AES_BLOCK_LEN;
 	}
 }
 
 void
+aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
+    uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN])
+{
+	__m128i blocks[8];
+	__m128i *bufs;
+	__m128i ivreg, nextiv;
+	size_t i, j, cnt;
+
+	ivreg = _mm_loadu_si128((const __m128i *)iv);
+	cnt = len / AES_BLOCK_LEN / 8;
+	for (i = 0; i < cnt; i++) {
+		bufs = (__m128i *)buf;
+		aesni_dec8(rounds - 1, key_schedule, bufs[0], bufs[1],
+		    bufs[2], bufs[3], bufs[4], bufs[5], bufs[6],
+		    bufs[7], &blocks[0]);
+		for (j = 0; j < 8; j++) {
+			nextiv = bufs[j];
+			bufs[j] = blocks[j] ^ ivreg;
+			ivreg = nextiv;
+		}
+		buf += AES_BLOCK_LEN * 8;
+	}
+	i *= 8;
+	cnt = len / AES_BLOCK_LEN;
+	for (; i < cnt; i++) {
+		bufs = (__m128i *)buf;
+		nextiv = bufs[0];
+		bufs[0] = aesni_dec(rounds - 1, key_schedule, bufs[0]) ^ ivreg;
+		ivreg = nextiv;
+		buf += AES_BLOCK_LEN;
+	}
+}
+
+void
 aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
-    const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
+    const uint8_t *from, uint8_t *to)
 {
-	size_t i;
+	__m128i tot;
+	const __m128i *blocks;
+	size_t i, cnt;
 
-	len /= AES_BLOCK_LEN;
-	for (i = 0; i < len; i++) {
-		aesni_enc(rounds - 1, key_schedule, from, to, NULL);
+	cnt = len / AES_BLOCK_LEN / 8;
+	for (i = 0; i < cnt; i++) {
+		blocks = (const __m128i *)from;
+		aesni_enc8(rounds - 1, key_schedule, blocks[0], blocks[1],
+		    blocks[2], blocks[3], blocks[4], blocks[5], blocks[6],
+		    blocks[7], (__m128i *)to);
+		from += AES_BLOCK_LEN * 8;
+		to += AES_BLOCK_LEN * 8;
+	}
+	i *= 8;
+	cnt = len / AES_BLOCK_LEN;
+	for (; i < cnt; i++) {
+		tot = aesni_enc(rounds - 1, key_schedule,
+		    _mm_loadu_si128((const __m128i *)from));
+		_mm_storeu_si128((__m128i *)to, tot);
 		from += AES_BLOCK_LEN;
 		to += AES_BLOCK_LEN;
 	}
@@ -72,11 +125,25 @@
 aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
     const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
 {
-	size_t i;
+	__m128i tot;
+	const __m128i *blocks;
+	size_t i, cnt;
 
-	len /= AES_BLOCK_LEN;
-	for (i = 0; i < len; i++) {
-		aesni_dec(rounds - 1, key_schedule, from, to, NULL);
+	cnt = len / AES_BLOCK_LEN / 8;
+	for (i = 0; i < cnt; i++) {
+		blocks = (const __m128i *)from;
+		aesni_dec8(rounds - 1, key_schedule, blocks[0], blocks[1],
+		    blocks[2], blocks[3], blocks[4], blocks[5], blocks[6],
+		    blocks[7], (__m128i *)to);
+		from += AES_BLOCK_LEN * 8;
+		to += AES_BLOCK_LEN * 8;
+	}
+	i *= 8;
+	cnt = len / AES_BLOCK_LEN;
+	for (; i < cnt; i++) {
+		tot = aesni_dec(rounds - 1, key_schedule,
+		    _mm_loadu_si128((const __m128i *)from));
+		_mm_storeu_si128((__m128i *)to, tot);
 		from += AES_BLOCK_LEN;
 		to += AES_BLOCK_LEN;
 	}
@@ -86,34 +153,88 @@
 #define	AES_XTS_IVSIZE	8
 #define	AES_XTS_ALPHA	0x87	/* GF(2^128) generator polynomial */
 
+static inline __m128i
+xts_crank_lfsr(__m128i inp)
+{
+	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
+	__m128i xtweak, ret;
+
+	/* set up xor mask */
+	xtweak = _mm_shuffle_epi32(inp, 0x93);
+	xtweak = _mm_srai_epi32(xtweak, 31);
+	xtweak &= alphamask;
+
+	/* next term */
+	ret = _mm_slli_epi32(inp, 1);
+	ret ^= xtweak;
+
+	return ret;
+}
+
 static void
-aesni_crypt_xts_block(int rounds, const void *key_schedule, uint64_t *tweak,
-    const uint64_t *from, uint64_t *to, uint64_t *block, int do_encrypt)
+aesni_crypt_xts_block(int rounds, const void *key_schedule, __m128i *tweak,
+    const __m128i *from, __m128i *to, int do_encrypt)
 {
-	int carry;
+	__m128i block;
 
-	block[0] = from[0] ^ tweak[0];
-	block[1] = from[1] ^ tweak[1];
+	block = *from ^ *tweak;
 
 	if (do_encrypt)
-		aesni_enc(rounds - 1, key_schedule, (uint8_t *)block, (uint8_t *)to, NULL);
+		block = aesni_enc(rounds - 1, key_schedule, block);
 	else
-		aesni_dec(rounds - 1, key_schedule, (uint8_t *)block, (uint8_t *)to, NULL);
+		block = aesni_dec(rounds - 1, key_schedule, block);
 
-	to[0] ^= tweak[0];
-	to[1] ^= tweak[1];
+	*to = block ^ *tweak;
 
-	/* Exponentiate tweak. */
-	carry = ((tweak[0] & 0x8000000000000000ULL) > 0);
-	tweak[0] <<= 1;
-	if (tweak[1] & 0x8000000000000000ULL) {
-		uint8_t *twk = (uint8_t *)tweak;
-
-		twk[0] ^= AES_XTS_ALPHA;
-	}
-	tweak[1] <<= 1;
-	if (carry)
-		tweak[1] |= 1;
+	*tweak = xts_crank_lfsr(*tweak);
+}
+
+static void
+aesni_crypt_xts_block8(int rounds, const void *key_schedule, __m128i *tweak,
+    const __m128i *from, __m128i *to, int do_encrypt)
+{
+	__m128i tmptweak;
+	__m128i a, b, c, d, e, f, g, h;
+	__m128i tweaks[8];
+	__m128i tmp[8];
+
+	tmptweak = *tweak;
+
+	/*
+	 * unroll the loop.  This lets gcc put values directly in the
+	 * register and saves memory accesses.
+	 */
+#define	PREPINP(v, pos) 					\
+		do {						\
+			tweaks[(pos)] = tmptweak;		\
+			(v) = from[(pos)] ^ tmptweak;		\
+			tmptweak = xts_crank_lfsr(tmptweak);	\
+		} while (0)
+	PREPINP(a, 0);
+	PREPINP(b, 1);
+	PREPINP(c, 2);
+	PREPINP(d, 3);
+	PREPINP(e, 4);
+	PREPINP(f, 5);
+	PREPINP(g, 6);
+	PREPINP(h, 7);
+	*tweak = tmptweak;
+
+	if (do_encrypt)
+		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
+		    tmp);
+	else
+		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
+		    tmp);
+
+	to[0] = tmp[0] ^ tweaks[0];
+	to[1] = tmp[1] ^ tweaks[1];
+	to[2] = tmp[2] ^ tweaks[2];
+	to[3] = tmp[3] ^ tweaks[3];
+	to[4] = tmp[4] ^ tweaks[4];
+	to[5] = tmp[5] ^ tweaks[5];
+	to[6] = tmp[6] ^ tweaks[6];
+	to[7] = tmp[7] ^ tweaks[7];
 }
 
 static void
@@ -121,9 +242,9 @@
 	const void *tweak_schedule, size_t len, const uint8_t *from,
 	uint8_t *to, const uint8_t iv[AES_BLOCK_LEN], int do_encrypt)
 {
-	uint64_t block[AES_XTS_BLOCKSIZE / 8];
-	uint8_t tweak[AES_XTS_BLOCKSIZE];
-	size_t i;
+	__m128i tweakreg;
+	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
+	size_t i, cnt;
 
 	/*
 	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
@@ -136,21 +257,27 @@
 #else
 #error	Only LITTLE_ENDIAN architectures are supported.
 #endif
-	aesni_enc(rounds - 1, tweak_schedule, tweak, tweak, NULL);
+	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
+	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);
 
-	len /= AES_XTS_BLOCKSIZE;
-	for (i = 0; i < len; i++) {
-		aesni_crypt_xts_block(rounds, data_schedule, (uint64_t *)tweak,
-		    (const uint64_t *)from, (uint64_t *)to, block, do_encrypt);
+	cnt = len / AES_XTS_BLOCKSIZE / 8;
+	for (i = 0; i < cnt; i++) {
+		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
+		    (const __m128i *)from, (__m128i *)to, do_encrypt);
+		from += AES_XTS_BLOCKSIZE * 8;
+		to += AES_XTS_BLOCKSIZE * 8;
+	}
+	i *= 8;
+	cnt = len / AES_XTS_BLOCKSIZE;
+	for (; i < cnt; i++) {
+		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
+		    (const __m128i *)from, (__m128i *)to, do_encrypt);
 		from += AES_XTS_BLOCKSIZE;
 		to += AES_XTS_BLOCKSIZE;
 	}
-
-	bzero(tweak, sizeof(tweak));
-	bzero(block, sizeof(block));
 }
 
-static void
+void
 aesni_encrypt_xts(int rounds, const void *data_schedule,
     const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
     const uint8_t iv[AES_BLOCK_LEN])
@@ -160,7 +287,7 @@
 	    iv, 1);
 }
 
-static void
+void
 aesni_decrypt_xts(int rounds, const void *data_schedule,
     const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
     const uint8_t iv[AES_BLOCK_LEN])
Index: sys/modules/aesni/Makefile
===================================================================
--- sys/modules/aesni/Makefile	(revision 254981)
+++ sys/modules/aesni/Makefile	(working copy)
@@ -3,8 +3,17 @@
 .PATH: ${.CURDIR}/../../crypto/aesni
 
 KMOD=	aesni
-SRCS=	aesni.c aesni_wrap.c
-SRCS+=	aesencdec_${MACHINE_CPUARCH}.S aeskeys_${MACHINE_CPUARCH}.S
+SRCS=	aesni.c
+SRCS+=	aeskeys_${MACHINE_CPUARCH}.S
 SRCS+=	device_if.h bus_if.h opt_bus.h cryptodev_if.h
+OBJS+=	aesni_wrap.o
+
+# Remove -nostdinc so we can get the intrinsics.
+aesni_wrap.o: aesni_wrap.c
+	${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+	    -mmmx -msse -maes ${.IMPSRC}
+	${CTFCONVERT_CMD}
+
 .include <bsd.kmod.mk>
+
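Not part of the patch: xts_crank_lfsr() above computes, with SSE shifts, the
same GF(2^128) multiply-by-alpha that the removed uint64_t code in
aesni_crypt_xts_block() spelled out: shift the 128-bit tweak left one bit and,
when the top bit falls off, fold the polynomial 0x87 back into the low byte.
A scalar sketch for cross-checking (xts_mul_alpha is a hypothetical name);
starting from 1 and cranking 128 times yields x^128 mod p, i.e. 0x87:

#include <stdint.h>
#include <stdio.h>

static void
xts_mul_alpha(uint64_t t[2])	/* t[0] holds the low 64 bits */
{
	uint64_t carry = t[1] >> 63;

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}

int
main(void)
{
	uint64_t t[2] = { 1, 0 };
	int i;

	for (i = 0; i < 128; i++)
		xts_mul_alpha(t);
	printf("%016jx%016jx\n", (uintmax_t)t[1], (uintmax_t)t[0]);
	return (0);
}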