New upstream version 17.11-rc3
similarity index 72%
rename from drivers/net/mlx5/mlx5_rxtx_vec_sse.c
rename to drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 8560f74..2b9f160 100644
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+
 #include <assert.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
 #include <smmintrin.h>
 
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-#include <infiniband/verbs.h>
-#include <infiniband/mlx5_hw.h>
-#include <infiniband/arch.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
-
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_mbuf.h>
 #include <rte_mempool.h>
 #include <rte_prefetch.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
 #include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"
  * @param txq
  *   Pointer to TX queue structure.
  * @param dseg
- *   Pointer to buffer descriptor to be writen.
+ *   Pointer to buffer descriptor to be written.
  * @param pkts
  *   Pointer to array of packets to be sent.
  * @param n
  *   Number of packets to be filled.
  */
 static inline void
-txq_wr_dseg_v(struct txq *txq, __m128i *dseg,
+txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
              struct rte_mbuf **pkts, unsigned int n)
 {
        unsigned int pos;
@@ -118,85 +103,6 @@ txq_wr_dseg_v(struct txq *txq, __m128i *dseg,
 #endif
 }
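For reference, a scalar sketch of what each 16B descriptor write in
txq_wr_dseg_v() amounts to, assuming the mlx5_wqe_data_seg layout
{ byte_count, lkey, addr } from mlx5_prm.h; the vectorized function builds
the same 16 bytes in an __m128i and emits them with _mm_store_si128():

```c
/* Scalar equivalent of one data-segment write. txq_mb2mr() is a
 * hypothetical stand-in for the PMD's mbuf-to-memory-region lkey lookup. */
static inline void
txq_wr_dseg_scalar(struct mlx5_txq_data *txq,
                   volatile struct mlx5_wqe_data_seg *dseg,
                   struct rte_mbuf *pkt)
{
        dseg->byte_count = rte_cpu_to_be_32(rte_pktmbuf_data_len(pkt));
        dseg->lkey = txq_mb2mr(txq, pkt);               /* hypothetical */
        dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(pkt, uintptr_t));
}
```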
 
-/**
- * Count the number of continuous single segment packets. The first packet must
- * be a single segment packet.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of continuous single segment packets.
- */
-static inline unsigned int
-txq_check_multiseg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-       unsigned int pos;
-
-       if (!pkts_n)
-               return 0;
-       assert(NB_SEGS(pkts[0]) == 1);
-       /* Count the number of continuous single segment packets. */
-       for (pos = 1; pos < pkts_n; ++pos)
-               if (NB_SEGS(pkts[pos]) > 1)
-                       break;
-       return pos;
-}
-
-/**
- * Count the number of packets having same ol_flags and calculate cs_flags.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- * @param cs_flags
- *   Pointer of flags to be returned.
- *
- * @return
- *   Number of packets having same ol_flags.
- */
-static inline unsigned int
-txq_calc_offload(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-                uint8_t *cs_flags)
-{
-       unsigned int pos;
-       const uint64_t ol_mask =
-               PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
-               PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-               PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
-
-       if (!pkts_n)
-               return 0;
-       /* Count the number of packets having same ol_flags. */
-       for (pos = 1; pos < pkts_n; ++pos)
-               if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
-                       break;
-       /* Should open another MPW session for the rest. */
-       if (pkts[0]->ol_flags &
-           (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-               const uint64_t is_tunneled =
-                       pkts[0]->ol_flags &
-                       (PKT_TX_TUNNEL_GRE |
-                        PKT_TX_TUNNEL_VXLAN);
-
-               if (is_tunneled && txq->tunnel_en) {
-                       *cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
-                                   MLX5_ETH_WQE_L4_INNER_CSUM;
-                       if (pkts[0]->ol_flags & PKT_TX_OUTER_IP_CKSUM)
-                               *cs_flags |= MLX5_ETH_WQE_L3_CSUM;
-               } else {
-                       *cs_flags = MLX5_ETH_WQE_L3_CSUM |
-                                   MLX5_ETH_WQE_L4_CSUM;
-               }
-       }
-       return pos;
-}
-
 /**
  * Send multi-segmented packets until a single-segment packet is encountered
  * in the pkts list.
@@ -212,7 +118,8 @@ txq_calc_offload(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
  *   Number of packets successfully transmitted (<= pkts_n).
  */
 static uint16_t
-txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
+             uint16_t pkts_n)
 {
        uint16_t elts_head = txq->elts_head;
        const uint16_t elts_n = 1 << txq->elts_n;
@@ -257,13 +164,17 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                if (segs_n == 1 ||
                    max_elts < segs_n || max_wqe < 2)
                        break;
+               if (segs_n > MLX5_MPW_DSEG_MAX) {
+                       txq->stats.oerrors++;
+                       break;
+               }
                wqe = &((volatile struct mlx5_wqe64 *)
                         txq->wqes)[wqe_ci & wq_mask].hdr;
                if (buf->ol_flags &
                     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-                       const uint64_t is_tunneled = buf->ol_flags &
-                                                     (PKT_TX_TUNNEL_GRE |
-                                                      PKT_TX_TUNNEL_VXLAN);
+                       const uint64_t is_tunneled =
+                               buf->ol_flags & (PKT_TX_TUNNEL_GRE |
+                                                PKT_TX_TUNNEL_VXLAN);
 
                        if (is_tunneled && txq->tunnel_en) {
                                cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
@@ -298,7 +209,7 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                /* Fill ESEG in the header. */
                _mm_store_si128(t_wqe + 1,
                                _mm_set_epi16(0, 0, 0, 0,
-                                             htons(len), cs_flags,
+                                             rte_cpu_to_be_16(len), cs_flags,
                                              0, 0));
                txq->wqe_ci = wqe_ci;
        }
@@ -307,7 +218,7 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
        txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
        txq->elts_head = elts_head;
        if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-               wqe->ctrl[2] = htonl(8);
+               wqe->ctrl[2] = rte_cpu_to_be_32(8);
                wqe->ctrl[3] = txq->elts_head;
                txq->elts_comp = 0;
                ++txq->cq_pi;
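A recurring change in this diff is replacing htons()/htonl()/htonll() with
rte_cpu_to_be_16/32/64() from rte_byteorder.h: with <infiniband/arch.h> gone
(removed at the top of this file), the non-standard htonll() is no longer
available, and the DPDK forms compile to no-ops on big-endian CPUs and fold
at compile time for constants. A minimal check that the swap is
semantics-preserving, assuming a little-endian host:

```c
#include <assert.h>
#include <rte_byteorder.h>

static void
byteorder_demo(void)
{
        assert(rte_cpu_to_be_16(0x1234) == 0x3412); /* htons() equivalent */
        assert(rte_cpu_to_be_32(8) == 0x08000000);  /* htonl() equivalent */
}
```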
@@ -338,7 +249,7 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
  *   Number of packets successfully transmitted (<= pkts_n).
  */
 static inline uint16_t
-txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
+txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
            uint8_t cs_flags)
 {
        struct rte_mbuf **elts;
@@ -374,6 +285,7 @@ txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
        max_elts = (elts_n - (elts_head - txq->elts_tail));
        max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
        pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
+       assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
        if (unlikely(!pkts_n))
                return 0;
        elts = &(*txq->elts)[elts_head & elts_m];
@@ -432,86 +344,10 @@ txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
        txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
                       nb_dword_per_wqebb;
        /* Ring QP doorbell. */
-       mlx5_tx_dbrec(txq, wqe);
+       mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
        return pkts_n;
 }
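mlx5_tx_dbrec_cond_wmb() replaces the unconditional mlx5_tx_dbrec() above:
the trailing write barrier after ringing the doorbell is only paid when the
burst came back short (pkts_n < MLX5_VPMD_TX_MAX_BURST), i.e. when no
immediate follow-up burst can be counted on to flush it. A sketch of that
idea; the txq field names (qp_db, bf_reg) and barrier strengths are
assumptions, the real helper lives in mlx5_rxtx.h:

```c
/* Sketch only: ring the Tx doorbell, deferring the final flush barrier
 * when the caller is expected to ring again right away. */
static __rte_always_inline void
tx_dbrec_cond_wmb_sketch(struct mlx5_txq_data *txq,
                         volatile uint64_t *wqe, int cond)
{
        rte_wmb();                                   /* WQEs before DB record */
        *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci); /* doorbell record */
        rte_wmb();                                   /* DB record before HW write */
        *(volatile uint64_t *)txq->bf_reg = *wqe;    /* ring the HW doorbell */
        if (cond)
                rte_wmb();      /* flush now; no follow-up burst expected */
}
```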
 
-/**
- * DPDK callback for vectorized TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-                     uint16_t pkts_n)
-{
-       struct txq *txq = (struct txq *)dpdk_txq;
-       uint16_t nb_tx = 0;
-
-       while (pkts_n > nb_tx) {
-               uint16_t n;
-               uint16_t ret;
-
-               n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-               ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
-               nb_tx += ret;
-               if (!ret)
-                       break;
-       }
-       return nb_tx;
-}
-
-/**
- * DPDK callback for vectorized TX with multi-seg packets and offload.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-       struct txq *txq = (struct txq *)dpdk_txq;
-       uint16_t nb_tx = 0;
-
-       while (pkts_n > nb_tx) {
-               uint8_t cs_flags = 0;
-               uint16_t n;
-               uint16_t ret;
-
-               /* Transmit multi-seg packets in the head of pkts list. */
-               if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
-                   NB_SEGS(pkts[nb_tx]) > 1)
-                       nb_tx += txq_scatter_v(txq,
-                                              &pkts[nb_tx],
-                                              pkts_n - nb_tx);
-               n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-               if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS))
-                       n = txq_check_multiseg(&pkts[nb_tx], n);
-               if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
-                       n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags);
-               ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
-               nb_tx += ret;
-               if (!ret)
-                       break;
-       }
-       return nb_tx;
-}
-
 /**
  * Store free buffers to RX SW ring.
  *
@@ -523,7 +359,7 @@ mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
  *   Number of packets to be stored.
  */
 static inline void
-rxq_copy_mbuf_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t n)
+rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
 {
        const uint16_t q_mask = (1 << rxq->elts_n) - 1;
        struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
@@ -540,41 +376,6 @@ rxq_copy_mbuf_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t n)
                pkts[pos] = elts[pos];
 }
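The elided body of rxq_copy_mbuf_v() moves mbuf pointers from the SW ring
into the caller's array; on 64-bit targets each 16B vector store carries two
pointers, with the scalar tail visible above. A plausible shape of that loop
(a sketch, not the elided code verbatim):

```c
static inline void
copy_mbuf_ptrs_sketch(struct rte_mbuf **pkts, struct rte_mbuf **elts,
                      uint16_t n)
{
        uint16_t pos;

        /* Two struct rte_mbuf * per unaligned 16B store. */
        for (pos = 0; pos + 2 <= n; pos += 2)
                _mm_storeu_si128((__m128i *)&pkts[pos],
                                 _mm_loadu_si128((__m128i *)&elts[pos]));
        if (pos < n)
                pkts[pos] = elts[pos];
}
```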
 
-/**
- * Replenish buffers for RX in bulk.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param n
- *   Number of buffers to be replenished.
- */
-static inline void
-rxq_replenish_bulk_mbuf(struct rxq *rxq, uint16_t n)
-{
-       const uint16_t q_n = 1 << rxq->elts_n;
-       const uint16_t q_mask = q_n - 1;
-       const uint16_t elts_idx = rxq->rq_ci & q_mask;
-       struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
-       volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx];
-       unsigned int i;
-
-       assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH);
-       assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
-       assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP);
-       /* Not to cross queue end. */
-       n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
-       if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
-               rxq->stats.rx_nombuf += n;
-               return;
-       }
-       for (i = 0; i < n; ++i)
-               wq[i].addr = htonll((uintptr_t)elts[i]->buf_addr +
-                                   RTE_PKTMBUF_HEADROOM);
-       rxq->rq_ci += n;
-       rte_wmb();
-       *rxq->rq_db = htonl(rxq->rq_ci);
-}
-
 /**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
@@ -588,8 +389,7 @@ rxq_replenish_bulk_mbuf(struct rxq *rxq, uint16_t n)
  *   the title completion descriptor to be copied to the rest of mbufs.
  */
 static inline void
-rxq_cq_decompress_v(struct rxq *rxq,
-                   volatile struct mlx5_cqe *cq,
+rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
                    struct rte_mbuf **elts)
 {
        volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
@@ -636,13 +436,6 @@ rxq_cq_decompress_v(struct rxq *rxq,
                             10, 11,  2,  3);
 #endif
 
-       /* Compile time sanity check for this function. */
-       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
-                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
-       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
-                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
-       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
-                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
        /*
         * A. load mCQEs into a 128bit register.
         * B. store rearm data to mbuf.
@@ -747,12 +540,13 @@ rxq_cq_decompress_v(struct rxq *rxq,
  *   Pointer to array of packets to be filled.
  */
 static inline void
-rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
-                        struct rte_mbuf **pkts)
+rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
+                        __m128i op_err, struct rte_mbuf **pkts)
 {
        __m128i pinfo0, pinfo1;
        __m128i pinfo, ptype;
-       __m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH);
+       __m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH |
+                                         rxq->hw_timestamp * PKT_RX_TIMESTAMP);
        __m128i cv_flags;
        const __m128i zero = _mm_setzero_si128();
        const __m128i ptype_mask =
@@ -769,17 +563,17 @@ rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
                             (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
                             0,
                             (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
-                            (uint8_t)(PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED),
+                            (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
                             0);
        const __m128i cv_mask =
                _mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-                             PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
+                             PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
                              PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-                             PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
+                             PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
                              PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-                             PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
+                             PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
                              PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-                             PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
+                             PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
        const __m128i mbuf_init =
                _mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
        __m128i rearm0, rearm1, rearm2, rearm3;
@@ -853,66 +647,17 @@ rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
        /* Merge to ol_flags. */
        ol_flags = _mm_or_si128(ol_flags, cv_flags);
        /* Merge mbuf_init and ol_flags. */
-       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
-                        offsetof(struct rte_mbuf, rearm_data) + 8);
        rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
        rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
        rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
        rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
        /* Write 8B rearm_data and 8B ol_flags. */
-       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
-                        RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
        _mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
        _mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
        _mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
        _mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
 }
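Two notes on the flag handling above. The ol_flags seed earlier in the
function uses the branchless multiply-by-boolean idiom (rxq->rss_hash *
PKT_RX_RSS_HASH | rxq->hw_timestamp * PKT_RX_TIMESTAMP). And the 0x30 blend
mask works because _mm_blend_epi16(a, b, imm) takes 16-bit lane i from b when
bit i of imm is set: 0x30 selects lanes 4-5, i.e. bytes 8..11 of the 16B
store, which is where ol_flags sits, 8 bytes past rearm_data in struct
rte_mbuf; the three shift variants align each packet's 32-bit flags into that
lane. A standalone check of the lane arithmetic:

```c
#include <smmintrin.h>  /* SSE4.1: _mm_blend_epi16 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t out[4];
        __m128i init = _mm_setzero_si128();
        __m128i flags = _mm_set1_epi32(0x5a5a5a5a);

        /* Only dword 2 (bytes 8..11) should come from 'flags'. */
        _mm_storeu_si128((__m128i *)out, _mm_blend_epi16(init, flags, 0x30));
        printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
        /* prints: 00000000 00000000 5a5a5a5a 00000000 */
        return 0;
}
```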
 
-/**
- * Skip error packets.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-rxq_handle_pending_error(struct rxq *rxq, struct rte_mbuf **pkts,
-                        uint16_t pkts_n)
-{
-       uint16_t n = 0;
-       unsigned int i;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-       uint32_t err_bytes = 0;
-#endif
-
-       for (i = 0; i < pkts_n; ++i) {
-               struct rte_mbuf *pkt = pkts[i];
-
-               if (pkt->packet_type == RTE_PTYPE_ALL_MASK) {
-#ifdef MLX5_PMD_SOFT_COUNTERS
-                       err_bytes += PKT_LEN(pkt);
-#endif
-                       rte_pktmbuf_free_seg(pkt);
-               } else {
-                       pkts[n++] = pkt;
-               }
-       }
-       rxq->stats.idropped += (pkts_n - n);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-       /* Correct counters of errored completions. */
-       rxq->stats.ipackets -= (pkts_n - n);
-       rxq->stats.ibytes -= err_bytes;
-#endif
-       rxq->pending_err = 0;
-       return n;
-}
-
 /**
  * Receive burst of packets. An errored completion also consumes an mbuf, but
  * the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
@@ -929,7 +674,7 @@ rxq_handle_pending_error(struct rxq *rxq, struct rte_mbuf **pkts,
  *   Number of packets received including errors (<= pkts_n).
  */
 static inline uint16_t
-rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
        const uint16_t q_n = 1 << rxq->cqe_n;
        const uint16_t q_mask = q_n - 1;
@@ -984,26 +729,6 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                              rxq->crc_present * ETHER_CRC_LEN);
        const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
 
-       /* Compile time sanity check for this function. */
-       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
-                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
-       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
-                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
-       RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, pkt_info) != 0);
-       RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rx_hash_res) !=
-                        offsetof(struct mlx5_cqe, pkt_info) + 12);
-       RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd1) +
-                         sizeof(((struct mlx5_cqe *)0)->rsvd1) !=
-                        offsetof(struct mlx5_cqe, hdr_type_etc));
-       RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, vlan_info) !=
-                        offsetof(struct mlx5_cqe, hdr_type_etc) + 2);
-       RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd2) +
-                         sizeof(((struct mlx5_cqe *)0)->rsvd2) !=
-                        offsetof(struct mlx5_cqe, byte_cnt));
-       RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, sop_drop_qpn) !=
-                        RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8));
-       RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, op_own) !=
-                        offsetof(struct mlx5_cqe, sop_drop_qpn) + 7);
        assert(rxq->sges_n == 0);
        assert(rxq->cqe_n == rxq->elts_n);
        cq = &(*rxq->cqes)[cq_idx];
@@ -1022,7 +747,7 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
         */
        repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
        if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH)
-               rxq_replenish_bulk_mbuf(rxq, repl_n);
+               mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
        /* See if there're unreturned mbufs from compressed CQE. */
        rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
        if (rcvd_pkt > 0) {
@@ -1214,6 +939,16 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                rxq->pending_err |= !!_mm_cvtsi128_si64(opcode);
                /* D.5 fill in mbuf - rearm_data and packet_type. */
                rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+               if (rxq->hw_timestamp) {
+                       pkts[pos]->timestamp =
+                               rte_be_to_cpu_64(cq[pos].timestamp);
+                       pkts[pos + 1]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p1].timestamp);
+                       pkts[pos + 2]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p2].timestamp);
+                       pkts[pos + 3]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p3].timestamp);
+               }
 #ifdef MLX5_PMD_SOFT_COUNTERS
                /* Add up received bytes count. */
                byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
@@ -1254,164 +989,9 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                        rcvd_pkt += n;
                }
        }
-       rte_wmb();
-       *rxq->cq_db = htonl(rxq->cq_ci);
+       rte_compiler_barrier();
+       *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
        return rcvd_pkt;
 }
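Two behavioral notes on the reworked rxq_burst_v(). The CQ doorbell write now
follows rte_compiler_barrier() rather than rte_wmb(): x86 does not reorder
stores to cacheable memory, so only compiler reordering has to be prevented
before *rxq->cq_db is updated. And when rxq->hw_timestamp is set, each mbuf's
timestamp field is filled from its CQE and PKT_RX_TIMESTAMP is raised via the
ol_flags seed; an application might consume it as in this sketch, where
handle_ts() is hypothetical:

```c
#include <rte_ethdev.h>
#include <rte_mbuf.h>

static void
drain_with_timestamps(uint16_t port_id, uint16_t queue_id)
{
        struct rte_mbuf *pkts[32];
        uint16_t i, n = rte_eth_rx_burst(port_id, queue_id, pkts, 32);

        for (i = 0; i < n; ++i) {
                if (pkts[i]->ol_flags & PKT_RX_TIMESTAMP)
                        handle_ts(pkts[i]->timestamp);  /* hypothetical */
                rte_pktmbuf_free(pkts[i]);
        }
}
```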
 
-/**
- * DPDK callback for vectorized RX.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-       struct rxq *rxq = dpdk_rxq;
-       uint16_t nb_rx;
-
-       nb_rx = rxq_burst_v(rxq, pkts, pkts_n);
-       if (unlikely(rxq->pending_err))
-               nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
-       return nb_rx;
-}
-
-/**
- * Check Tx queue flags are set for raw vectorized Tx.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-priv_check_raw_vec_tx_support(struct priv *priv)
-{
-       uint16_t i;
-
-       /* All the configured queues should support. */
-       for (i = 0; i < priv->txqs_n; ++i) {
-               struct txq *txq = (*priv->txqs)[i];
-
-               if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) ||
-                   !(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
-                       break;
-       }
-       if (i != priv->txqs_n)
-               return -ENOTSUP;
-       return 1;
-}
-
-/**
- * Check a device can support vectorized TX.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-priv_check_vec_tx_support(struct priv *priv)
-{
-       if (!priv->tx_vec_en ||
-           priv->txqs_n > MLX5_VPMD_MIN_TXQS ||
-           priv->mps != MLX5_MPW_ENHANCED ||
-           priv->tso)
-               return -ENOTSUP;
-       return 1;
-}
-
-/**
- * Check a RX queue can support vectorized RX.
- *
- * @param rxq
- *   Pointer to RX queue.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-rxq_check_vec_support(struct rxq *rxq)
-{
-       struct rxq_ctrl *ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-
-       if (!ctrl->priv->rx_vec_en || rxq->sges_n != 0)
-               return -ENOTSUP;
-       return 1;
-}
-
-/**
- * Check a device can support vectorized RX.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-priv_check_vec_rx_support(struct priv *priv)
-{
-       uint16_t i;
-
-       if (!priv->rx_vec_en)
-               return -ENOTSUP;
-       /* All the configured queues should support. */
-       for (i = 0; i < priv->rxqs_n; ++i) {
-               struct rxq *rxq = (*priv->rxqs)[i];
-
-               if (rxq_check_vec_support(rxq) < 0)
-                       break;
-       }
-       if (i != priv->rxqs_n)
-               return -ENOTSUP;
-       return 1;
-}
-
-/**
- * Prepare for vectorized RX.
- *
- * @param priv
- *   Pointer to private structure.
- */
-void
-priv_prep_vec_rx_function(struct priv *priv)
-{
-       uint16_t i;
-
-       for (i = 0; i < priv->rxqs_n; ++i) {
-               struct rxq *rxq = (*priv->rxqs)[i];
-               struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
-               const uint16_t desc = 1 << rxq->elts_n;
-               int j;
-
-               assert(rxq->elts_n == rxq->cqe_n);
-               /* Initialize default rearm_data for vPMD. */
-               mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
-               rte_mbuf_refcnt_set(mbuf_init, 1);
-               mbuf_init->nb_segs = 1;
-               mbuf_init->port = rxq->port_id;
-               /*
-                * prevent compiler reordering:
-                * rearm_data covers previous fields.
-                */
-               rte_compiler_barrier();
-               rxq->mbuf_initializer =
-                       *(uint64_t *)&mbuf_init->rearm_data;
-               /* Padding with a fake mbuf for vectorized Rx. */
-               for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
-                       (*rxq->elts)[desc + j] = &rxq->fake_mbuf;
-               /* Mark that it need to be cleaned up for rxq_alloc_elts(). */
-               rxq->trim_elts = 1;
-       }
-}
+#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */