+/** \brief Compress one group of up to 64 u16 elements
+
+    For every bit i set in mask, src[i] is appended to dst; in the scalar
+    path elements are taken in ascending bit order.
+
+    @param dst destination array of u16 elements
+    @param src source array of u16 elements; the vector path loads all 64
+           elements from src regardless of which mask bits are set
+    @param mask 64-bit compress mask, bit i selects src[i]
+    @return pointer to the element following the last one stored
+*/
+static_always_inline u16 *
+clib_compress_u16_x64 (u16 *dst, u16 *src, u64 mask)
+{
+#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16)
+  /* two 32-lane halves, low half of the mask first */
+  u16x32u *vec = (u16x32u *) src;
+  u32 lo = (u32) mask;
+  u32 hi = (u32) (mask >> 32);
+
+  u16x32_compress_store (vec[0], lo, dst);
+  dst += _popcnt32 (lo);
+  u16x32_compress_store (vec[1], hi, dst);
+  dst += _popcnt32 (hi);
+#else
+  /* walk the set bits from lowest to highest */
+  for (; mask; mask = clear_lowest_set_bit (mask))
+    *dst++ = src[count_trailing_zeros (mask)];
+#endif
+  return dst;
+}
+
+/** \brief Compress array of 16-bit elements into destination array based on
+ * mask
+
+ @param dst destination array of u16 elements
+ @param src source array of u16 elements
+ @param mask array of u64 values representing compress mask
+ @param n_elts number of elements in the source array
+ @return number of elements stored in destination array
+*/
+
+static_always_inline u32
+clib_compress_u16 (u16 *dst, u16 *src, u64 *mask, u32 n_elts)
+{
+ u16 *dst0 = dst;
+ while (n_elts >= 64)
+ {
+ if (mask[0] == ~0ULL)
+ {
+ clib_memcpy_fast (dst, src, 64 * sizeof (u16));
+ dst += 64;
+ }
+ else
+ dst = clib_compress_u16_x64 (dst, src, mask[0]);
+
+ mask++;
+ src += 64;
+ n_elts -= 64;
+ }
+
+ if (PREDICT_TRUE (n_elts == 0))
+ return dst - dst0;
+
+ return clib_compress_u16_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
+}
+
+/** \brief Compress one group of up to 64 u8 elements
+
+    For every bit i set in mask, src[i] is appended to dst; in the scalar
+    path elements are taken in ascending bit order.
+
+    @param dst destination array of u8 elements
+    @param src source array of u8 elements; the vector path loads all 64
+           elements from src regardless of which mask bits are set
+    @param mask 64-bit compress mask, bit i selects src[i]
+    @return pointer to the element following the last one stored
+*/
+static_always_inline u8 *
+clib_compress_u8_x64 (u8 *dst, u8 *src, u64 mask)
+{
+#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16)
+  /* a single 64-lane compress store covers the whole group */
+  u8x64u *vec = (u8x64u *) src;
+
+  u8x64_compress_store (*vec, mask, dst);
+  dst += _popcnt64 (mask);
+#else
+  /* walk the set bits from lowest to highest */
+  for (; mask; mask = clear_lowest_set_bit (mask))
+    *dst++ = src[count_trailing_zeros (mask)];
+#endif
+  return dst;
+}
+
+/** \brief Compress array of 8-bit elements into destination array based on
+ * mask
+
+ @param dst destination array of u8 elements
+ @param src source array of u8 elements
+ @param mask array of u64 values representing compress mask
+ @param n_elts number of elements in the source array
+ @return number of elements stored in destination array
+*/
+
+static_always_inline u32
+clib_compress_u8 (u8 *dst, u8 *src, u64 *mask, u32 n_elts)
+{
+ u8 *dst0 = dst;
+ while (n_elts >= 64)
+ {
+ if (mask[0] == ~0ULL)
+ {
+ clib_memcpy_fast (dst, src, 64);
+ dst += 64;
+ }
+ else
+ dst = clib_compress_u8_x64 (dst, src, mask[0]);
+
+ mask++;
+ src += 64;
+ n_elts -= 64;
+ }
+
+ if (PREDICT_TRUE (n_elts == 0))
+ return dst - dst0;
+
+ return clib_compress_u8_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
+}
+