{
#ifdef __VAES__
r[0] = u8x64_xor3 (r[0], aes_block_load_x4 (src, i), k[0][0]);
- r[1] = u8x64_xor3 (r[1], aes_block_load_x4 (src, i), k[0][1]);
- r[2] = u8x64_xor3 (r[2], aes_block_load_x4 (src, i), k[0][2]);
- r[3] = u8x64_xor3 (r[3], aes_block_load_x4 (src, i), k[0][3]);
+ r[1] = u8x64_xor3 (r[1], aes_block_load_x4 (src + 4, i), k[0][1]);
+ r[2] = u8x64_xor3 (r[2], aes_block_load_x4 (src + 8, i), k[0][2]);
+ r[3] = u8x64_xor3 (r[3], aes_block_load_x4 (src + 12, i), k[0][3]);
for (j = 1; j < rounds; j++)
{