From 37385fcd836bcc086b56b8dc7089d5038c203f13 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sat, 15 Sep 2012 06:01:11 +0200 Subject: Rework some parts of the AES128-GCM method These changes improve the performance of the AES128-GCM method by ~10% on my Intel CPU when compiled with -O2. Furthermore, the AES and the GHASH parts are now separated, which makes it easier to switch to other implementations of the algorithms. --- src/fastd.c | 44 ++++++++++-- src/fastd.h | 4 +- src/method_aes128_gcm.c | 159 ++++++++++++++++++++++------------------- src/method_null.c | 8 ++- src/method_xsalsa20_poly1305.c | 6 ++ src/protocol_ec25519_fhmqvc.c | 2 +- 6 files changed, 137 insertions(+), 86 deletions(-) diff --git a/src/fastd.c b/src/fastd.c index dadcd17..3bbf30f 100644 --- a/src/fastd.c +++ b/src/fastd.c @@ -264,7 +264,7 @@ static size_t methods_min_encrypt_head_space(fastd_context *ctx) { ret = s; } - return ret; + return ALIGN(ret, 8); } static size_t methods_min_decrypt_head_space(fastd_context *ctx) { @@ -280,6 +280,38 @@ static size_t methods_min_decrypt_head_space(fastd_context *ctx) { ret = s; } + return ALIGN(ret, 8); +} + +static size_t methods_min_encrypt_tail_space(fastd_context *ctx) { + size_t ret = ctx->conf->methods[0]->min_encrypt_tail_space(ctx); + + int i; + for (i = 0; i < MAX_METHODS; i++) { + if (!ctx->conf->methods[i]) + break; + + size_t s = ctx->conf->methods[i]->min_encrypt_tail_space(ctx); + if (s > ret) + ret = s; + } + + return ret; +} + +static size_t methods_min_decrypt_tail_space(fastd_context *ctx) { + size_t ret = ctx->conf->methods[0]->min_decrypt_tail_space(ctx); + + int i; + for (i = 0; i < MAX_METHODS; i++) { + if (!ctx->conf->methods[i]) + break; + + size_t s = ctx->conf->methods[i]->min_decrypt_tail_space(ctx); + if (s > ret) + ret = s; + } + return ret; } @@ -360,7 +392,7 @@ void fastd_handle_receive(fastd_context *ctx, fastd_peer *peer, fastd_buffer buf fastd_peer *dest_peer; for (dest_peer = ctx->peers; dest_peer; dest_peer = 
dest_peer->next) { if (dest_peer != peer && fastd_peer_is_established(dest_peer)) { - fastd_buffer send_buffer = fastd_buffer_alloc(buffer.len, methods_min_encrypt_head_space(ctx), 0); + fastd_buffer send_buffer = fastd_buffer_alloc(buffer.len, methods_min_encrypt_head_space(ctx), methods_min_encrypt_tail_space(ctx)); memcpy(send_buffer.data, buffer.data, buffer.len); ctx->conf->protocol->send(ctx, dest_peer, send_buffer); } @@ -479,7 +511,7 @@ static void handle_tasks(fastd_context *ctx) { case TASK_KEEPALIVE: pr_debug(ctx, "sending keepalive to %P", task->peer); - ctx->conf->protocol->send(ctx, task->peer, fastd_buffer_alloc(0, methods_min_encrypt_head_space(ctx), 0)); + ctx->conf->protocol->send(ctx, task->peer, fastd_buffer_alloc(0, methods_min_encrypt_head_space(ctx), methods_min_encrypt_tail_space(ctx))); break; default: @@ -492,7 +524,7 @@ static void handle_tasks(fastd_context *ctx) { static void handle_tun(fastd_context *ctx) { size_t max_len = fastd_max_packet_size(ctx); - fastd_buffer buffer = fastd_buffer_alloc(max_len, methods_min_encrypt_head_space(ctx), 0); + fastd_buffer buffer = fastd_buffer_alloc(max_len, methods_min_encrypt_head_space(ctx), methods_min_encrypt_tail_space(ctx)); ssize_t len = read(ctx->tunfd, buffer.data, max_len); if (len < 0) { @@ -529,7 +561,7 @@ static void handle_tun(fastd_context *ctx) { if (peer == NULL) { for (peer = ctx->peers; peer; peer = peer->next) { if (fastd_peer_is_established(peer)) { - fastd_buffer send_buffer = fastd_buffer_alloc(len, methods_min_encrypt_head_space(ctx), 0); + fastd_buffer send_buffer = fastd_buffer_alloc(len, methods_min_encrypt_head_space(ctx), methods_min_encrypt_tail_space(ctx)); memcpy(send_buffer.data, buffer.data, len); ctx->conf->protocol->send(ctx, peer, send_buffer); } @@ -541,7 +573,7 @@ static void handle_tun(fastd_context *ctx) { static void handle_socket(fastd_context *ctx, int sockfd) { size_t max_len = PACKET_TYPE_LEN + methods_max_packet_size(ctx); - fastd_buffer buffer = 
fastd_buffer_alloc(max_len, ALIGN8(methods_min_decrypt_head_space(ctx)), 0); + fastd_buffer buffer = fastd_buffer_alloc(max_len, methods_min_decrypt_head_space(ctx), methods_min_decrypt_tail_space(ctx)); uint8_t *packet_type; fastd_peer_address recvaddr; diff --git a/src/fastd.h b/src/fastd.h index 12702e7..e1c531b 100644 --- a/src/fastd.h +++ b/src/fastd.h @@ -91,6 +91,8 @@ struct _fastd_method { size_t (*max_packet_size)(fastd_context *ctx); size_t (*min_encrypt_head_space)(fastd_context *ctx); size_t (*min_decrypt_head_space)(fastd_context *ctx); + size_t (*min_encrypt_tail_space)(fastd_context *ctx); + size_t (*min_decrypt_tail_space)(fastd_context *ctx); fastd_method_session_state* (*session_init)(fastd_context *ctx, uint8_t *secret, size_t length, bool initiator); bool (*session_is_valid)(fastd_context *ctx, fastd_method_session_state *session); @@ -349,7 +351,7 @@ static inline void fastd_string_stack_free(fastd_string_stack *str) { } } -#define ALIGN8(l) (((l+7)/8)*8) +#define ALIGN(l, a) (((l+a-1)/a)*a) static inline bool timespec_after(const struct timespec *tp1, const struct timespec *tp2) { return (tp1->tv_sec > tp2->tv_sec || diff --git a/src/method_aes128_gcm.c b/src/method_aes128_gcm.c index b0c3232..9f2d6b4 100644 --- a/src/method_aes128_gcm.c +++ b/src/method_aes128_gcm.c @@ -87,14 +87,19 @@ static size_t method_max_packet_size(fastd_context *ctx) { } -static size_t method_min_encrypt_head_space(fastd_context *ctx) { +static size_t method_min_head_space(fastd_context *ctx) { return 0; } -static size_t method_min_decrypt_head_space(fastd_context *ctx) { - return 0; +static size_t method_min_encrypt_tail_space(fastd_context *ctx) { + return (BLOCKBYTES-1); +} + +static size_t method_min_decrypt_tail_space(fastd_context *ctx) { + return (2*BLOCKBYTES-1); } + static const block_t r = { .b = {0xe1} }; static inline uint8_t shr(block_t *out, const block_t *in, int n) { @@ -110,15 +115,13 @@ static inline uint8_t shr(block_t *out, const block_t *in, int 
n) { return (c >> (8-n)); } -static inline void xor(uint8_t *x, const uint8_t *a, const uint8_t *b, unsigned int l) { - int i; - for (i = 0; i < l; i++) - x[i] = a[i] ^ b[i]; +static inline void xor(block_t *x, const block_t *a, const block_t *b) { + x->qw[0] = a->qw[0] ^ b->qw[0]; + x->qw[1] = a->qw[1] ^ b->qw[1]; } -static inline void xor_block(block_t *x, const block_t *a) { - x->qw[0] ^= a->qw[0]; - x->qw[1] ^= a->qw[1]; +static inline void xor_a(block_t *x, const block_t *a) { + xor(x, x, a); } static fastd_method_session_state* method_session_init(fastd_context *ctx, uint8_t *secret, size_t length, bool initiator) { @@ -137,8 +140,7 @@ static fastd_method_session_state* method_session_init(fastd_context *ctx, uint8 crypto_stream_aes128ctr_beforenm(session->d, secret); - uint8_t zerononce[crypto_stream_aes128ctr_NONCEBYTES]; - memset(zerononce, 0, crypto_stream_aes128ctr_NONCEBYTES); + static const uint8_t zerononce[crypto_stream_aes128ctr_NONCEBYTES] = {}; block_t Hbase[4]; crypto_stream_aes128ctr_afternm(Hbase[0].b, BLOCKBYTES, zerononce, session->d); @@ -149,7 +151,7 @@ static fastd_method_session_state* method_session_init(fastd_context *ctx, uint8 for (i = 1; i < 4; i++) { uint8_t carry = shr(&Hbase[i], &Hbase[i-1], 1); if (carry) - xor_block(&Hbase[i], &r); + xor_a(&Hbase[i], &r); shr(&Rbase[i], &Rbase[i-1], 1); } @@ -162,8 +164,8 @@ static fastd_method_session_state* method_session_init(fastd_context *ctx, uint8 int j; for (j = 0; j < 4; j++) { if (i & (8 >> j)) { - xor_block(&session->H[0][i], &Hbase[j]); - xor_block(&R[i], &Rbase[j]); + xor_a(&session->H[0][i], &Hbase[j]); + xor_a(&R[i], &Rbase[j]); } } } @@ -173,7 +175,7 @@ static fastd_method_session_state* method_session_init(fastd_context *ctx, uint8 for (j = 0; j < 16; j++) { uint8_t carry = shr(&session->H[i][j], &session->H[i-1][j], 4); - xor_block(&session->H[i][j], &R[carry]); + xor_a(&session->H[i][j], &R[carry]); } } @@ -207,57 +209,73 @@ static void method_session_free(fastd_context *ctx, 
fastd_method_session_state * } } -static void mulH(uint8_t out[BLOCKBYTES], const uint8_t in[BLOCKBYTES], fastd_method_session_state *session) { - block_t out2; - memset(&out2, 0, BLOCKBYTES); +static void mulH_a(block_t *x, fastd_method_session_state *session) { + block_t out = {}; int i; for (i = 0; i < 16; i++) { - xor_block(&out2, &session->H[2*i][in[i]>>4]); - xor_block(&out2, &session->H[2*i+1][in[i]&0xf]); + xor_a(&out, &session->H[2*i][x->b[i]>>4]); + xor_a(&out, &session->H[2*i+1][x->b[i]&0xf]); } - memcpy(out, &out2, BLOCKBYTES); + *x = out; } -#define BLOCKPTR(buf, i) (((uint8_t*)(buf))+(i)*BLOCKBYTES) +static inline void xor_blocks(block_t *out, const block_t *in1, const block_t *in2, size_t n_blocks) { + int i; + for (i = 0; i < n_blocks; i++) + xor(&out[i], &in1[i], &in2[i]); +} + +static inline void put_size(block_t *out, size_t len) { + memset(out, 0, BLOCKBYTES-5); + out->b[BLOCKBYTES-5] = len >> 29; + out->b[BLOCKBYTES-4] = len >> 21; + out->b[BLOCKBYTES-3] = len >> 13; + out->b[BLOCKBYTES-2] = len >> 5; + out->b[BLOCKBYTES-1] = len << 3; +} + +static inline void ghash(block_t *out, const block_t *blocks, size_t n_blocks, fastd_method_session_state *session) { + memset(out, 0, sizeof(block_t)); + + int i; + for (i = 0; i < n_blocks; i++) { + xor_a(out, &blocks[i]); + mulH_a(out, session); + } +} static bool method_encrypt(fastd_context *ctx, fastd_peer *peer, fastd_method_session_state *session, fastd_buffer *out, fastd_buffer in) { - *out = fastd_buffer_alloc(in.len, NONCEBYTES+BLOCKBYTES, 0); - uint8_t *sig = ((uint8_t*)out->data) - BLOCKBYTES; + size_t tail_len = ALIGN(in.len, BLOCKBYTES)-in.len; + *out = fastd_buffer_alloc(in.len, ALIGN(NONCEBYTES+BLOCKBYTES, 8), BLOCKBYTES+tail_len); - memset(sig, 0, BLOCKBYTES); + if (tail_len) + memset(in.data+in.len, 0, tail_len); uint8_t nonce[crypto_stream_aes128ctr_NONCEBYTES]; memcpy(nonce, session->send_nonce, NONCEBYTES); memset(nonce+NONCEBYTES, 0, crypto_stream_aes128ctr_NONCEBYTES-NONCEBYTES-1); 
nonce[crypto_stream_aes128ctr_NONCEBYTES-1] = 1; - uint8_t stream[in.len+BLOCKBYTES]; - crypto_stream_aes128ctr_afternm(stream, in.len+BLOCKBYTES, nonce, session->d); + int n_blocks = (in.len+BLOCKBYTES-1)/BLOCKBYTES; - int blocks = (in.len+BLOCKBYTES-1)/BLOCKBYTES; + block_t stream[n_blocks+1]; + crypto_stream_aes128ctr_afternm((uint8_t*)stream, sizeof(stream), nonce, session->d); - int i; - for (i = 0; i < blocks; i++) { - int len = BLOCKBYTES; - if (i == blocks-1) - len = in.len - i*BLOCKBYTES; + block_t *inblocks = in.data; + block_t *outblocks = out->data; - xor(BLOCKPTR(out->data, i), BLOCKPTR(in.data, i), BLOCKPTR(stream, i+1), len); + xor_blocks(outblocks, inblocks, stream+1, n_blocks); - xor(sig, sig, BLOCKPTR(out->data, i), len); - mulH(sig, sig, session); - } + if (tail_len) + memset(out->data+out->len, 0, tail_len); - sig[BLOCKBYTES-5] ^= (in.len >> 29) & 0xff; - sig[BLOCKBYTES-4] ^= (in.len >> 21) & 0xff; - sig[BLOCKBYTES-3] ^= (in.len >> 13) & 0xff; - sig[BLOCKBYTES-2] ^= (in.len >> 5) & 0xff; - sig[BLOCKBYTES-1] ^= (in.len << 3) & 0xff; - mulH(sig, sig, session); + put_size(&outblocks[n_blocks], in.len); - xor(sig, sig, stream, BLOCKBYTES); + block_t *sig = outblocks-1; + ghash(sig, outblocks, n_blocks+1, session); + xor_a(sig, &stream[0]); fastd_buffer_free(in); @@ -294,48 +312,37 @@ static bool method_decrypt(fastd_context *ctx, fastd_peer *peer, fastd_method_se fastd_buffer_push_head(&in, NONCEBYTES+BLOCKBYTES); - *out = fastd_buffer_alloc(in.len, 0, 0); + size_t tail_len = ALIGN(in.len, BLOCKBYTES)-in.len; + *out = fastd_buffer_alloc(in.len, 0, tail_len); - uint8_t sig[BLOCKBYTES]; - memset(sig, 0, BLOCKBYTES); + int n_blocks = (in.len+BLOCKBYTES-1)/BLOCKBYTES; - uint8_t stream[in.len+BLOCKBYTES]; - crypto_stream_aes128ctr_afternm(stream, in.len+BLOCKBYTES, nonce, session->d); + block_t stream[n_blocks+1]; + crypto_stream_aes128ctr_afternm((uint8_t*)stream, sizeof(stream), nonce, session->d); - int blocks = (in.len+BLOCKBYTES-1)/BLOCKBYTES; + 
block_t *inblocks = in.data; + block_t *outblocks = out->data; - int i; - for (i = 0; i < blocks; i++) { - int len = BLOCKBYTES; - if (i == blocks-1) - len = in.len - i*BLOCKBYTES; + if (tail_len) + memset(in.data+in.len, 0, tail_len); - xor(BLOCKPTR(out->data, i), BLOCKPTR(in.data, i), BLOCKPTR(stream, i+1), len); + put_size(&inblocks[n_blocks], in.len); - xor(sig, sig, BLOCKPTR(in.data, i), len); - mulH(sig, sig, session); - } - - sig[BLOCKBYTES-5] ^= (in.len >> 29) & 0xff; - sig[BLOCKBYTES-4] ^= (in.len >> 21) & 0xff; - sig[BLOCKBYTES-3] ^= (in.len >> 13) & 0xff; - sig[BLOCKBYTES-2] ^= (in.len >> 5) & 0xff; - sig[BLOCKBYTES-1] ^= (in.len << 3) & 0xff; - mulH(sig, sig, session); - - xor(sig, sig, stream, BLOCKBYTES); + block_t sig; + ghash(&sig, inblocks, n_blocks+1, session); + xor_a(&sig, &stream[0]); - fastd_buffer_pull_head(&in, BLOCKBYTES); - - if (memcmp(sig, in.data, BLOCKBYTES) != 0) { + if (memcmp(&sig, inblocks-1, BLOCKBYTES) != 0) { fastd_buffer_free(*out); /* restore input buffer */ - fastd_buffer_pull_head(&in, NONCEBYTES); + fastd_buffer_pull_head(&in, NONCEBYTES+BLOCKBYTES); return false; } + xor_blocks(outblocks, inblocks, stream+1, n_blocks); + fastd_buffer_free(in); if (age < 0) { @@ -361,8 +368,10 @@ const fastd_method fastd_method_aes128_gcm = { .name = "aes128-gcm", .max_packet_size = method_max_packet_size, - .min_encrypt_head_space = method_min_encrypt_head_space, - .min_decrypt_head_space = method_min_decrypt_head_space, + .min_encrypt_head_space = method_min_head_space, + .min_decrypt_head_space = method_min_head_space, + .min_encrypt_tail_space = method_min_encrypt_tail_space, + .min_decrypt_tail_space = method_min_decrypt_tail_space, .session_init = method_session_init, .session_is_valid = method_session_is_valid, diff --git a/src/method_null.c b/src/method_null.c index e8b3fca..7cdbe00 100644 --- a/src/method_null.c +++ b/src/method_null.c @@ -31,7 +31,7 @@ static size_t method_max_packet_size(fastd_context *ctx) { return 
fastd_max_packet_size(ctx); } -static size_t method_min_head_space(fastd_context *ctx) { +static size_t method_min_head_tail_space(fastd_context *ctx) { return 0; } @@ -66,8 +66,10 @@ const fastd_method fastd_method_null = { .name = "null", .max_packet_size = method_max_packet_size, - .min_encrypt_head_space = method_min_head_space, - .min_decrypt_head_space = method_min_head_space, + .min_encrypt_head_space = method_min_head_tail_space, + .min_decrypt_head_space = method_min_head_tail_space, + .min_encrypt_tail_space = method_min_head_tail_space, + .min_decrypt_tail_space = method_min_head_tail_space, .session_init = method_session_init, .session_is_valid = method_session_is_valid, diff --git a/src/method_xsalsa20_poly1305.c b/src/method_xsalsa20_poly1305.c index c7fb040..23f4c5d 100644 --- a/src/method_xsalsa20_poly1305.c +++ b/src/method_xsalsa20_poly1305.c @@ -86,6 +86,10 @@ static size_t method_min_decrypt_head_space(fastd_context *ctx) { return (crypto_secretbox_xsalsa20poly1305_BOXZEROBYTES - NONCEBYTES); } +static size_t method_min_tail_space(fastd_context *ctx) { + return 0; +} + static fastd_method_session_state* method_session_init(fastd_context *ctx, uint8_t *secret, size_t length, bool initiator) { int i; @@ -220,6 +224,8 @@ const fastd_method fastd_method_xsalsa20_poly1305 = { .max_packet_size = method_max_packet_size, .min_encrypt_head_space = method_min_encrypt_head_space, .min_decrypt_head_space = method_min_decrypt_head_space, + .min_encrypt_tail_space = method_min_tail_space, + .min_decrypt_tail_space = method_min_tail_space, .session_init = method_session_init, .session_is_valid = method_session_is_valid, diff --git a/src/protocol_ec25519_fhmqvc.c b/src/protocol_ec25519_fhmqvc.c index b27f4d6..964c21d 100644 --- a/src/protocol_ec25519_fhmqvc.c +++ b/src/protocol_ec25519_fhmqvc.c @@ -726,7 +726,7 @@ static void protocol_send(fastd_context *ctx, fastd_peer *peer, fastd_buffer buf } static void send_empty(fastd_context *ctx, fastd_peer *peer, 
protocol_session *session) { - session_send(ctx, peer, fastd_buffer_alloc(0, session->method->min_encrypt_head_space(ctx), 0), session); + session_send(ctx, peer, fastd_buffer_alloc(0, ALIGN(session->method->min_encrypt_head_space(ctx), 8), session->method->min_encrypt_tail_space(ctx)), session); } static void protocol_init_peer_state(fastd_context *ctx, fastd_peer *peer) { -- cgit v1.2.3