X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Fip%2Fip_packet.h;h=04cf9f11d70c936ac111b52007ad00d02a551b12;hb=7c2a3357f32ae02bcc20cdad6d87beda39f71d31;hp=c160deefbeb2814112e866bf349846611e95b4a6;hpb=ae34872077c956aa34ee816b55ccb5c5f6ab40a1;p=vpp.git

diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
old mode 100644
new mode 100755
index c160deefbeb..04cf9f11d70
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -149,98 +149,6 @@ STATIC_ASSERT_SIZEOF (ip_ecn_t, 1);
 
 extern u8 *format_ip_ecn (u8 * s, va_list * va);
 
-/* IP checksum support. */
-
-static_always_inline u16
-ip_csum (void *data, u16 n_left)
-{
-  u32 sum;
-#ifdef CLIB_HAVE_VEC256
-  u16x16 v1, v2;
-  u32x8 zero = { 0 };
-  u32x8 sum8 = { 0 };
-  u32x4 sum4;
-#endif
-
-  /* if there is odd number of bytes, pad by zero and store in sum */
-  sum = (n_left & 1) ? ((u8 *) data)[n_left - 1] << 8 : 0;
-
-  /* we deal with words */
-  n_left >>= 1;
-
-#ifdef CLIB_HAVE_VEC256
-  while (n_left >= 32)
-    {
-      v1 = u16x16_load_unaligned (data);
-      v2 = u16x16_load_unaligned (data + 32);
-
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
-      v1 = u16x16_byte_swap (v1);
-      v2 = u16x16_byte_swap (v2);
-#endif
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v2));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v2));
-      n_left -= 32;
-      data += 64;
-    }
-
-  if (n_left >= 16)
-    {
-      v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
-      v1 = u16x16_byte_swap (v1);
-#endif
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
-      n_left -= 16;
-      data += 32;
-    }
-
-  if (n_left)
-    {
-      v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
-      v1 = u16x16_byte_swap (v1);
-#endif
-      v1 = u16x16_mask_last (v1, 16 - n_left);
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
-    }
-
-  sum8 = u32x8_hadd (sum8, zero);
-  sum4 = u32x8_extract_lo (sum8) + u32x8_extract_hi (sum8);
-  sum = sum4[0] + sum4[1];
-
-#else
-  /* scalar version */
-  while (n_left >= 8)
-    {
-      sum += clib_net_to_host_u16 (*((u16 *) data + 0));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 1));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 2));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 3));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 4));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 5));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 6));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 7));
-      n_left -= 8;
-      data += 16;
-    }
-  while (n_left)
-    {
-      sum += clib_net_to_host_u16 (*(u16 *) data);
-      n_left -= 1;
-      data += 2;
-    }
-#endif
-
-  sum = (sum & 0xffff) + (sum >> 16);
-  sum = (sum & 0xffff) + (sum >> 16);
-  return ~((u16) sum);
-}
-
 /* Incremental checksum update. */
 
 typedef uword ip_csum_t;
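For reference, the routine deleted above implements the RFC 1071 internet checksum: accumulate the data as big-endian 16-bit words in one's-complement arithmetic, fold the carries back in, and invert. A minimal scalar sketch of that computation, assuming <stdint.h> types in place of VPP's u8/u16/u32 typedefs (csum16 is an illustrative name, not a VPP symbol):

    #include <stdint.h>
    #include <stddef.h>

    /* One's-complement checksum over big-endian 16-bit words, mirroring
     * the scalar branch of the deleted ip_csum (); an odd trailing byte
     * is padded with a zero low byte, as in the original.  Assumes
     * n_bytes <= 65535, matching the u16 length of the original.
     * Sketch only, not the VPP implementation. */
    static uint16_t
    csum16 (const void *data, size_t n_bytes)
    {
      const uint8_t *p = data;
      uint32_t sum = (n_bytes & 1) ? (uint32_t) p[n_bytes - 1] << 8 : 0;
      size_t i;

      for (i = 0; i + 1 < n_bytes; i += 2)
        sum += (uint32_t) ((p[i] << 8) | p[i + 1]);

      /* two folds absorb every carry a 32-bit accumulator can hold */
      sum = (sum & 0xffff) + (sum >> 16);
      sum = (sum & 0xffff) + (sum >> 16);
      return (uint16_t) ~sum;
    }

Any optimized variant, such as the CLIB_HAVE_VEC256 path removed here, must produce bit-identical results to this scalar form for every input length.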
@@ -301,6 +209,20 @@ always_inline u16
 ip_csum_fold (ip_csum_t c)
 {
   /* Reduce to 16 bits. */
+#if defined(__x86_64__) && defined(__BMI2__)
+  u64 tmp;
+  asm volatile(
+      /* using ADC is much faster than the mov/shift/add sequence
+       * the compiler produces */
+      "mov %k[sum], %k[tmp]			\n\t"
+      "shr $32, %[sum]				\n\t"
+      "add %k[tmp], %k[sum]			\n\t"
+      "mov $16, %k[tmp]				\n\t"
+      "shrx %k[tmp], %k[sum], %k[tmp]		\n\t"
+      "adc %w[tmp], %w[sum]			\n\t"
+      "adc $0, %w[sum]				\n\t"
+      : [ sum ] "+&r"(c), [ tmp ] "=&r"(tmp));
+#else
 #if uword_bits == 64
   c = (c & (ip_csum_t) 0xffffffff) + (c >> (ip_csum_t) 32);
   c = (c & 0xffff) + (c >> 16);
@@ -308,7 +230,7 @@ ip_csum_fold (ip_csum_t c)
 
   c = (c & 0xffff) + (c >> 16);
   c = (c & 0xffff) + (c >> 16);
-
+#endif
   return c;
 }
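In the new x86_64 branch above, the __BMI2__ guard is what makes the ADC chain work: SHRX, unlike SHR, leaves the flags untouched, so the carry produced by the 32-bit ADD survives into the two ADC instructions (MOV is likewise flag-neutral). The #else branch performs the same reduction portably. Below is a standalone sketch of that portable fold with a few spot checks; csum_fold64 is an illustrative name, not the VPP API, and uint64_t stands in for ip_csum_t (a uword, 64-bit on the platforms the asm targets):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Portable mirror of the #else branch: fold a 64-bit one's-complement
     * accumulator down to 16 bits.  One 32-bit fold plus three 16-bit
     * folds absorb every carry a 64-bit sum can produce. */
    static uint16_t
    csum_fold64 (uint64_t c)
    {
      c = (c & 0xffffffffULL) + (c >> 32);
      c = (c & 0xffff) + (c >> 16);
      c = (c & 0xffff) + (c >> 16);
      c = (c & 0xffff) + (c >> 16);
      return (uint16_t) c;
    }

    int
    main (void)
    {
      assert (csum_fold64 (0) == 0);
      /* (0x00010002 + 0x00030004) = 0x00040006; 0x0004 + 0x0006 = 0x000a */
      assert (csum_fold64 (0x0001000200030004ULL) == 0x000a);
      /* an all-ones accumulator folds to 0xffff, never to 0 */
      assert (csum_fold64 (~0ULL) == 0xffff);
      printf ("folds ok\n");
      return 0;
    }

Note that ip_csum_fold returns the folded sum, not its complement; the one's-complement inversion happens outside it, as in the deleted ip_csum () above, which ends with return ~((u16) sum).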