4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
43 #pragma GCC diagnostic ignored "-Wpedantic"
45 #include <infiniband/verbs.h>
46 #include <infiniband/arch.h>
47 #include <infiniband/mlx5_hw.h>
49 #pragma GCC diagnostic error "-Wpedantic"
52 /* DPDK headers don't like -pedantic. */
54 #pragma GCC diagnostic ignored "-Wpedantic"
57 #include <rte_malloc.h>
58 #include <rte_ethdev.h>
59 #include <rte_common.h>
61 #pragma GCC diagnostic error "-Wpedantic"
65 #include "mlx5_rxtx.h"
66 #include "mlx5_utils.h"
67 #include "mlx5_autoconf.h"
68 #include "mlx5_defs.h"
70 /* Initialization data for hash RX queues. */
/* NOTE(review): each entry maps one hash RX queue type to its Verbs RSS
 * hash fields, the matching DPDK ETH_RSS_* flag(s), its flow
 * specification, and an "underlayer" pointer chaining L4 -> L3 -> L2.
 * Entry order presumably follows enum hash_rxq_type — TODO confirm;
 * this dump is missing lines (closing braces, array indices). */
71 const struct hash_rxq_init hash_rxq_init[] = {
/* TCPv4: hash on IPv4 src/dst address and TCP src/dst port. */
73 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
74 IBV_EXP_RX_HASH_DST_IPV4 |
75 IBV_EXP_RX_HASH_SRC_PORT_TCP |
76 IBV_EXP_RX_HASH_DST_PORT_TCP),
77 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
79 .flow_spec.tcp_udp = {
80 .type = IBV_EXP_FLOW_SPEC_TCP,
81 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
83 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
/* UDPv4: hash on IPv4 src/dst address and UDP src/dst port. */
86 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
87 IBV_EXP_RX_HASH_DST_IPV4 |
88 IBV_EXP_RX_HASH_SRC_PORT_UDP |
89 IBV_EXP_RX_HASH_DST_PORT_UDP),
90 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
92 .flow_spec.tcp_udp = {
93 .type = IBV_EXP_FLOW_SPEC_UDP,
94 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
96 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
/* IPv4: L3-only hashing; underlayer is the Ethernet entry. */
99 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
100 IBV_EXP_RX_HASH_DST_IPV4),
101 .dpdk_rss_hf = (ETH_RSS_IPV4 |
105 .type = IBV_EXP_FLOW_SPEC_IPV4,
106 .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
108 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
/* TCPv6: hash on IPv6 src/dst address and TCP src/dst port. */
111 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
112 IBV_EXP_RX_HASH_DST_IPV6 |
113 IBV_EXP_RX_HASH_SRC_PORT_TCP |
114 IBV_EXP_RX_HASH_DST_PORT_TCP),
115 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
117 .flow_spec.tcp_udp = {
118 .type = IBV_EXP_FLOW_SPEC_TCP,
119 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
121 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
/* UDPv6: hash on IPv6 src/dst address and UDP src/dst port. */
124 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
125 IBV_EXP_RX_HASH_DST_IPV6 |
126 IBV_EXP_RX_HASH_SRC_PORT_UDP |
127 IBV_EXP_RX_HASH_DST_PORT_UDP),
128 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
130 .flow_spec.tcp_udp = {
131 .type = IBV_EXP_FLOW_SPEC_UDP,
132 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
134 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
/* IPv6: L3-only hashing; underlayer is the Ethernet entry. */
137 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
138 IBV_EXP_RX_HASH_DST_IPV6),
139 .dpdk_rss_hf = (ETH_RSS_IPV6 |
143 .type = IBV_EXP_FLOW_SPEC_IPV6,
144 .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
146 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
/* Ethernet: no RSS hashing, catches remaining traffic; no underlayer. */
153 .type = IBV_EXP_FLOW_SPEC_ETH,
154 .size = sizeof(hash_rxq_init[0].flow_spec.eth),
160 /* Number of entries in hash_rxq_init[]. */
161 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
163 /* Initialization data for hash RX queue indirection tables. */
/* NOTE(review): first entry groups the TCP/UDP/IP hash types (RSS-capable),
 * second entry is Ethernet-only. Some lines of this initializer are
 * missing from this dump. */
164 static const struct ind_table_init ind_table_init[] = {
166 .max_size = -1u, /* Superseded by HW limitations. */
168 1 << HASH_RXQ_TCPV4 |
169 1 << HASH_RXQ_UDPV4 |
171 1 << HASH_RXQ_TCPV6 |
172 1 << HASH_RXQ_UDPV6 |
/* Ethernet-only table: single hash type, no RSS spreading. */
179 .hash_types = 1 << HASH_RXQ_ETH,
/* Compile-time number of entries in ind_table_init[]. */
184 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
186 /* Default RSS hash key also used for ConnectX-3. */
/* 40 bytes, used as the Toeplitz hash key when the application does not
 * provide one (see rx_hash_key fallback in priv_create_hash_rxqs()). */
187 uint8_t rss_hash_default_key[] = {
188 0x2c, 0xc6, 0x81, 0xd1,
189 0x5b, 0xdb, 0xf4, 0xf7,
190 0xfc, 0xa2, 0x83, 0x19,
191 0xdb, 0x1a, 0x3e, 0x94,
192 0x6b, 0x9e, 0x38, 0xd9,
193 0x2c, 0x9c, 0x03, 0xd1,
194 0xad, 0x99, 0x44, 0xa7,
195 0xd9, 0x56, 0x3d, 0x59,
196 0x06, 0x3c, 0x25, 0xf3,
197 0xfc, 0x1f, 0xdc, 0x2a,
200 /* Length of the default RSS hash key. */
201 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
204 * Populate flow steering rule for a given hash RX queue type using
205 * information from hash_rxq_init[]. Nothing is written to flow_attr when
206 * flow_attr_size is not large enough, but the required size is still returned.
209 * Pointer to private structure.
210 * @param[out] flow_attr
211 * Pointer to flow attribute structure to fill. Note that the allocated
212 * area must be larger and large enough to hold all flow specifications.
213 * @param flow_attr_size
214 * Entire size of flow_attr and trailing room for flow specifications.
216 * Hash RX queue type to use for flow steering rule.
219 * Total size of the flow attribute buffer. No errors are defined.
222 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
223 size_t flow_attr_size, enum hash_rxq_type type)
225 size_t offset = sizeof(*flow_attr);
226 const struct hash_rxq_init *init = &hash_rxq_init[type];
228 assert(priv != NULL);
229 assert((size_t)type < RTE_DIM(hash_rxq_init));
/* First pass: walk the underlayer chain to accumulate the total size
 * of the attribute header plus every flow specification. */
231 offset += init->flow_spec.hdr.size;
232 init = init->underlayer;
233 } while (init != NULL);
/* Required size exceeds the caller's buffer: report it without
 * writing anything (see contract above). */
234 if (offset > flow_attr_size)
236 flow_attr_size = offset;
237 init = &hash_rxq_init[type];
238 *flow_attr = (struct ibv_exp_flow_attr){
239 .type = IBV_EXP_FLOW_ATTR_NORMAL,
240 /* Priorities < 3 are reserved for flow director. */
241 .priority = init->flow_priority + 3,
/* Second pass: copy specs back-to-front so the outermost layer ends
 * up first in memory; offset walks backwards from the buffer end. */
247 offset -= init->flow_spec.hdr.size;
248 memcpy((void *)((uintptr_t)flow_attr + offset),
250 init->flow_spec.hdr.size);
251 ++flow_attr->num_of_specs;
252 init = init->underlayer;
253 } while (init != NULL);
254 return flow_attr_size;
258 * Convert hash type position in indirection table initializer to
259 * hash RX queue type.
262 * Indirection table initializer.
264 * Hash type position.
267 * Hash RX queue type.
269 static enum hash_rxq_type
270 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
272 enum hash_rxq_type type = HASH_RXQ_TCPV4;
274 assert(pos < table->hash_types_n);
/* Scan hash_types bits from the first type; pos counts down through
 * the set bits until the pos-th enabled type is reached. */
276 if ((table->hash_types & (1 << type)) && (pos-- == 0))
284 * Filter out disabled hash RX queue types from ind_table_init[].
287 * Pointer to private structure.
292 * Number of table entries.
295 priv_make_ind_table_init(struct priv *priv,
296 struct ind_table_init (*table)[IND_TABLE_INIT_N])
301 unsigned int table_n = 0;
302 /* Mandatory to receive frames not handled by normal hash RX queues. */
303 unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
305 rss_hf = priv->rss_hf;
306 /* Process other protocols only if more than one queue. */
307 if (priv->rxqs_n > 1)
308 for (i = 0; (i != hash_rxq_init_n); ++i)
309 if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
310 hash_types_sup |= (1 << i);
312 /* Filter out entries whose protocols are not in the set. */
313 for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
317 /* j is increased only if the table has valid protocols. */
319 (*table)[j] = ind_table_init[i];
320 (*table)[j].hash_types &= hash_types_sup;
321 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
322 if (((*table)[j].hash_types >> h) & 0x1)
324 (*table)[i].hash_types_n = nb;
334 * Initialize hash RX queues and indirection table.
337 * Pointer to private structure.
340 * 0 on success, errno value on failure.
343 priv_create_hash_rxqs(struct priv *priv)
/* NOTE(review): wqs[] is a VLA sized by the RETA; assumes reta_idx_n is
 * small and nonzero here — TODO confirm callers guarantee this. */
345 struct ibv_exp_wq *wqs[priv->reta_idx_n];
346 struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
347 unsigned int ind_tables_n =
348 priv_make_ind_table_init(priv, &ind_table_init);
349 unsigned int hash_rxqs_n = 0;
350 struct hash_rxq (*hash_rxqs)[] = NULL;
351 struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
/* Function must only run when no hash RX queues exist yet. */
357 assert(priv->ind_tables == NULL);
358 assert(priv->ind_tables_n == 0);
359 assert(priv->hash_rxqs == NULL);
360 assert(priv->hash_rxqs_n == 0);
361 assert(priv->pd != NULL);
362 assert(priv->ctx != NULL);
363 if (priv->rxqs_n == 0)
365 assert(priv->rxqs != NULL);
366 if (ind_tables_n == 0) {
367 ERROR("all hash RX queue types have been filtered out,"
368 " indirection table cannot be created");
371 if (priv->rxqs_n & (priv->rxqs_n - 1)) {
372 INFO("%u RX queues are configured, consider rounding this"
373 " number to the next power of two for better balancing",
375 DEBUG("indirection table extended to assume %u WQs",
/* Collect the Verbs WQ of every RX queue referenced by the RETA. */
378 for (i = 0; (i != priv->reta_idx_n); ++i) {
379 struct rxq_ctrl *rxq_ctrl;
381 rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
382 struct rxq_ctrl, rxq);
383 wqs[i] = rxq_ctrl->wq;
385 /* Get number of hash RX queues to configure. */
386 for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
387 hash_rxqs_n += ind_table_init[i].hash_types_n;
388 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
389 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
390 /* Create indirection tables. */
391 ind_tables = rte_calloc(__func__, ind_tables_n,
392 sizeof((*ind_tables)[0]), 0);
393 if (ind_tables == NULL) {
395 ERROR("cannot allocate indirection tables container: %s",
399 for (i = 0; (i != ind_tables_n); ++i) {
400 struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
402 .log_ind_tbl_size = 0, /* Set below. */
/* Table size is clamped to the RETA size before taking log2. */
406 unsigned int ind_tbl_size = ind_table_init[i].max_size;
407 struct ibv_exp_rwq_ind_table *ind_table;
409 if (priv->reta_idx_n < ind_tbl_size)
410 ind_tbl_size = priv->reta_idx_n;
411 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
413 ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
415 if (ind_table != NULL) {
416 (*ind_tables)[i] = ind_table;
419 /* Not clear whether errno is set. */
420 err = (errno ? errno : EINVAL);
421 ERROR("RX indirection table creation failed with error %d: %s",
425 /* Allocate array that holds hash RX queues and related data. */
426 hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
427 sizeof((*hash_rxqs)[0]), 0);
428 if (hash_rxqs == NULL) {
430 ERROR("cannot allocate hash RX queues container: %s",
/* i: hash RX queue index, j: indirection table index, k: position of
 * the hash type within table j. */
434 for (i = 0, j = 0, k = 0;
435 ((i != hash_rxqs_n) && (j != ind_tables_n));
437 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
438 enum hash_rxq_type type =
439 hash_rxq_type_from_pos(&ind_table_init[j], k);
440 struct rte_eth_rss_conf *priv_rss_conf =
441 (*priv->rss_conf)[type];
442 struct ibv_exp_rx_hash_conf hash_conf = {
443 .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
/* Fall back to the built-in default key when the application
 * did not supply an RSS configuration for this type. */
444 .rx_hash_key_len = (priv_rss_conf ?
445 priv_rss_conf->rss_key_len :
446 rss_hash_default_key_len),
447 .rx_hash_key = (priv_rss_conf ?
448 priv_rss_conf->rss_key :
449 rss_hash_default_key),
450 .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
451 .rwq_ind_tbl = (*ind_tables)[j],
453 struct ibv_exp_qp_init_attr qp_init_attr = {
454 .max_inl_recv = 0, /* Currently not supported. */
455 .qp_type = IBV_QPT_RAW_PACKET,
456 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
457 IBV_EXP_QP_INIT_ATTR_RX_HASH),
459 .rx_hash_conf = &hash_conf,
460 .port_num = priv->port,
463 DEBUG("using indirection table %u for hash RX queue %u type %d",
465 *hash_rxq = (struct hash_rxq){
467 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
470 if (hash_rxq->qp == NULL) {
471 err = (errno ? errno : EINVAL);
472 ERROR("Hash RX QP creation failure: %s",
476 if (++k < ind_table_init[j].hash_types_n)
478 /* Switch to the next indirection table and reset hash RX
479 * queue type array index. */
/* Success: publish everything in priv. */
483 priv->ind_tables = ind_tables;
484 priv->ind_tables_n = ind_tables_n;
485 priv->hash_rxqs = hash_rxqs;
486 priv->hash_rxqs_n = hash_rxqs_n;
/* Error path: destroy any QPs created so far, then the indirection
 * tables, then free both containers. */
490 if (hash_rxqs != NULL) {
491 for (i = 0; (i != hash_rxqs_n); ++i) {
492 struct ibv_qp *qp = (*hash_rxqs)[i].qp;
496 claim_zero(ibv_destroy_qp(qp));
500 if (ind_tables != NULL) {
501 for (j = 0; (j != ind_tables_n); ++j) {
502 struct ibv_exp_rwq_ind_table *ind_table =
505 if (ind_table == NULL)
507 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
509 rte_free(ind_tables);
515 * Clean up hash RX queues and indirection table.
518 * Pointer to private structure.
521 priv_destroy_hash_rxqs(struct priv *priv)
525 DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
/* Nothing to do when no hash RX queues were created. */
526 if (priv->hash_rxqs_n == 0) {
527 assert(priv->hash_rxqs == NULL);
528 assert(priv->ind_tables == NULL);
531 for (i = 0; (i != priv->hash_rxqs_n); ++i) {
532 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
535 assert(hash_rxq->priv == priv);
536 assert(hash_rxq->qp != NULL);
537 /* Also check that there are no remaining flows. */
538 for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
540 (k != RTE_DIM(hash_rxq->special_flow[j]));
542 assert(hash_rxq->special_flow[j][k] == NULL);
543 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
544 for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
545 assert(hash_rxq->mac_flow[j][k] == NULL);
546 claim_zero(ibv_destroy_qp(hash_rxq->qp));
/* Release the containers and reset priv so the function can run
 * again after a future priv_create_hash_rxqs(). */
548 priv->hash_rxqs_n = 0;
549 rte_free(priv->hash_rxqs);
550 priv->hash_rxqs = NULL;
551 for (i = 0; (i != priv->ind_tables_n); ++i) {
552 struct ibv_exp_rwq_ind_table *ind_table =
553 (*priv->ind_tables)[i];
555 assert(ind_table != NULL);
556 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
558 priv->ind_tables_n = 0;
559 rte_free(priv->ind_tables);
560 priv->ind_tables = NULL;
564 * Check whether a given flow type is allowed.
567 * Pointer to private structure.
569 * Flow type to check.
572 * Nonzero if the given flow type is allowed.
575 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
577 /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
578 * has been requested. */
579 if (priv->promisc_req)
580 return type == HASH_RXQ_FLOW_TYPE_PROMISC;
/* Otherwise decide per flow type from the requested modes. */
582 case HASH_RXQ_FLOW_TYPE_PROMISC:
583 return !!priv->promisc_req;
584 case HASH_RXQ_FLOW_TYPE_ALLMULTI:
585 return !!priv->allmulti_req;
586 case HASH_RXQ_FLOW_TYPE_BROADCAST:
587 case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
588 /* If allmulti is enabled, broadcast and ipv6multi
589 * are unnecessary. */
590 return !priv->allmulti_req;
591 case HASH_RXQ_FLOW_TYPE_MAC:
594 /* Unsupported flow type is not allowed. */
601 * Automatically enable/disable flows according to configuration.
607 * 0 on success, errno value on failure.
610 priv_rehash_flows(struct priv *priv)
612 enum hash_rxq_flow_type i;
/* Toggle every special flow type based on priv_allow_flow_type(). */
614 for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
615 i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
617 if (!priv_allow_flow_type(priv, i)) {
618 priv_special_flow_disable(priv, i);
620 int ret = priv_special_flow_enable(priv, i);
/* MAC flows are handled separately from the special flows. */
625 if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
626 return priv_mac_addrs_enable(priv);
627 priv_mac_addrs_disable(priv);
632 * Allocate RX queue elements.
635 * Pointer to RX queue structure.
637 * Number of elements to allocate.
639 * If not NULL, fetch buffers from this array instead of allocating them
640 * with rte_pktmbuf_alloc().
643 * 0 on success, errno value on failure.
646 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
647 struct rte_mbuf *(*pool)[])
649 const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
653 /* Iterate on segments. */
654 for (i = 0; (i != elts_n); ++i) {
655 struct rte_mbuf *buf;
656 volatile struct mlx5_wqe_data_seg *scat =
657 &(*rxq_ctrl->rxq.wqes)[i];
/* When reusing a pool (rehash path), buffers are reset and their
 * refcount bumped instead of being freshly allocated. */
662 rte_pktmbuf_reset(buf);
663 rte_pktmbuf_refcnt_update(buf, 1);
665 buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
667 assert(pool == NULL);
668 ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
672 /* Headroom is reserved by rte_pktmbuf_alloc(). */
673 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
674 /* Buffer is supposed to be empty. */
675 assert(rte_pktmbuf_data_len(buf) == 0);
676 assert(rte_pktmbuf_pkt_len(buf) == 0);
678 /* Only the first segment keeps headroom. */
680 SET_DATA_OFF(buf, 0);
681 PORT(buf) = rxq_ctrl->rxq.port_id;
682 DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
683 PKT_LEN(buf) = DATA_LEN(buf);
685 /* scat->addr must be able to store a pointer. */
686 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
/* Fill the hardware scatter entry for this segment (big-endian). */
687 *scat = (struct mlx5_wqe_data_seg){
688 .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
689 .byte_count = htonl(DATA_LEN(buf)),
690 .lkey = htonl(rxq_ctrl->mr->lkey),
692 (*rxq_ctrl->rxq.elts)[i] = buf;
694 DEBUG("%p: allocated and configured %u segments (max %u packets)",
695 (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
/* Error path: free everything allocated so far and clear elts. */
699 assert(pool == NULL);
701 for (i = 0; (i != elts_n); ++i) {
702 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
703 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
704 (*rxq_ctrl->rxq.elts)[i] = NULL;
706 DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
712 * Free RX queue elements.
715 * Pointer to RX queue structure.
718 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
722 DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
/* No-op when the elements array was never allocated. */
723 if (rxq_ctrl->rxq.elts == NULL)
/* Free each segment and NULL its slot to avoid double frees. */
726 for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
727 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
728 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
729 (*rxq_ctrl->rxq.elts)[i] = NULL;
734 * Clean up a RX queue.
736 * Destroy objects, free allocated memory and reset the structure for reuse.
739 * Pointer to RX queue structure.
742 rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
744 struct ibv_exp_release_intf_params params;
746 DEBUG("cleaning up %p", (void *)rxq_ctrl);
/* Teardown order: elements, fdir queue, interfaces, WQ, CQ, resource
 * domain, memory region; each step is guarded so a partially
 * initialized queue can be cleaned up safely. */
747 rxq_free_elts(rxq_ctrl);
748 if (rxq_ctrl->fdir_queue != NULL)
749 priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
750 if (rxq_ctrl->if_wq != NULL) {
751 assert(rxq_ctrl->priv != NULL);
752 assert(rxq_ctrl->priv->ctx != NULL);
753 assert(rxq_ctrl->wq != NULL);
754 params = (struct ibv_exp_release_intf_params){
757 claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
761 if (rxq_ctrl->if_cq != NULL) {
762 assert(rxq_ctrl->priv != NULL);
763 assert(rxq_ctrl->priv->ctx != NULL);
764 assert(rxq_ctrl->cq != NULL);
765 params = (struct ibv_exp_release_intf_params){
768 claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
772 if (rxq_ctrl->wq != NULL)
773 claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
774 if (rxq_ctrl->cq != NULL)
775 claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
776 if (rxq_ctrl->rd != NULL) {
777 struct ibv_exp_destroy_res_domain_attr attr = {
781 assert(rxq_ctrl->priv != NULL);
782 assert(rxq_ctrl->priv->ctx != NULL);
783 claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
787 if (rxq_ctrl->mr != NULL)
788 claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
/* Zero the structure so it can be reused by rxq_ctrl_setup(). */
789 memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
793 * Reconfigure RX queue buffers.
795 * rxq_rehash() does not allocate mbufs, which, if not done from the right
796 * thread (such as a control thread), may corrupt the pool.
797 * In case of failure, the queue is left untouched.
800 * Pointer to Ethernet device structure.
805 * 0 on success, errno value on failure.
808 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
810 unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
812 struct ibv_exp_wq_attr mod;
815 DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
816 (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
817 assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
818 /* From now on, any failure will render the queue unusable.
819 * Reinitialize WQ. */
820 mod = (struct ibv_exp_wq_attr){
821 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
822 .wq_state = IBV_EXP_WQS_RESET,
824 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
826 ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
830 /* Snatch mbufs from original queue. */
/* rxq_alloc_elts() with the existing elts array as the pool bumps
 * each refcount to 2; the free_seg below drops it back to 1 so the
 * buffers end up owned solely by the rebuilt queue. */
831 claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
832 for (i = 0; i != elts_n; ++i) {
833 struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
835 assert(rte_mbuf_refcnt_read(buf) == 2);
836 rte_pktmbuf_free_seg(buf);
838 /* Change queue state to ready. */
839 mod = (struct ibv_exp_wq_attr){
840 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
841 .wq_state = IBV_EXP_WQS_RDY,
843 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
845 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
846 (void *)dev, strerror(err));
849 /* Update doorbell counter. */
850 rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
/* rte_wmb() presumably precedes this doorbell write in the full
 * source — TODO confirm ordering before the device sees rq_db. */
852 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
859 * Initialize RX queue.
862 * Pointer to RX queue control template.
865 * 0 on success, errno value on failure.
868 rxq_setup(struct rxq_ctrl *tmpl)
870 struct ibv_cq *ibcq = tmpl->cq;
871 struct ibv_mlx5_cq_info cq_info;
/* Peek into the mlx5 provider's WQ internals to get direct access to
 * the RQ buffer and doorbell records. */
872 struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
873 struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
874 rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
876 if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
877 ERROR("Unable to query CQ info. check your OFED.");
/* The datapath assumes one CQE per cache line. */
880 if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
881 ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
882 "it should be set to %u", RTE_CACHE_LINE_SIZE);
887 tmpl->rxq.rq_db = rwq->rq.db;
888 tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt);
891 tmpl->rxq.cq_db = cq_info.dbrec;
893 (volatile struct mlx5_wqe_data_seg (*)[])
894 (uintptr_t)rwq->rq.buff;
896 (volatile struct mlx5_cqe (*)[])
897 (uintptr_t)cq_info.buf;
898 tmpl->rxq.elts = elts;
903 * Configure a RX queue.
906 * Pointer to Ethernet device structure.
908 * Pointer to RX queue structure.
910 * Number of descriptors to configure in queue.
912 * NUMA socket on which memory must be allocated.
914 * Thresholds parameters.
916 * Memory pool for buffer allocations.
919 * 0 on success, errno value on failure.
922 rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
923 uint16_t desc, unsigned int socket,
924 const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
926 struct priv *priv = dev->data->dev_private;
/* Build everything in a local template first; the real rxq_ctrl is
 * only overwritten once setup fully succeeds. */
927 struct rxq_ctrl tmpl = {
931 .elts_n = log2above(desc),
933 .rss_hash = priv->rxqs_n > 1,
936 struct ibv_exp_wq_attr mod;
938 struct ibv_exp_query_intf_params params;
939 struct ibv_exp_cq_init_attr cq;
940 struct ibv_exp_res_domain_init_attr rd;
941 struct ibv_exp_wq_init_attr wq;
942 struct ibv_exp_cq_attr cq_attr;
944 enum ibv_exp_query_intf_status status;
945 unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
946 unsigned int cqe_n = desc - 1;
947 struct rte_mbuf *(*elts)[desc] = NULL;
950 (void)conf; /* Thresholds configuration (ignored). */
951 /* Enable scattered packets support for this queue if necessary. */
952 assert(mb_len >= RTE_PKTMBUF_HEADROOM);
953 /* If smaller than MRU, multi-segment support must be enabled. */
954 if (mb_len < (priv->mtu > dev->data->dev_conf.rxmode.max_rx_pkt_len ?
955 dev->data->dev_conf.rxmode.max_rx_pkt_len :
957 dev->data->dev_conf.rxmode.jumbo_frame = 1;
958 if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
959 (dev->data->dev_conf.rxmode.max_rx_pkt_len >
960 (mb_len - RTE_PKTMBUF_HEADROOM))) {
962 RTE_PKTMBUF_HEADROOM +
963 dev->data->dev_conf.rxmode.max_rx_pkt_len;
967 * Determine the number of SGEs needed for a full packet
968 * and round it to the next power of two.
970 sges_n = log2above((size / mb_len) + !!(size % mb_len));
971 tmpl.rxq.sges_n = sges_n;
972 /* Make sure rxq.sges_n did not overflow. */
973 size = mb_len * (1 << tmpl.rxq.sges_n);
974 size -= RTE_PKTMBUF_HEADROOM;
975 if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
976 ERROR("%p: too many SGEs (%u) needed to handle"
977 " requested maximum packet size %u",
980 dev->data->dev_conf.rxmode.max_rx_pkt_len);
984 DEBUG("%p: maximum number of segments per packet: %u",
985 (void *)dev, 1 << tmpl.rxq.sges_n);
/* Descriptor count must divide evenly into whole packets. */
986 if (desc % (1 << tmpl.rxq.sges_n)) {
987 ERROR("%p: number of RX queue descriptors (%u) is not a"
988 " multiple of SGEs per packet (%u)",
991 1 << tmpl.rxq.sges_n);
994 /* Toggle RX checksum offload if hardware supports it. */
996 tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
997 if (priv->hw_csum_l2tun)
998 tmpl.rxq.csum_l2tun =
999 !!dev->data->dev_conf.rxmode.hw_ip_checksum;
1000 /* Use the entire RX mempool as the memory region. */
1001 tmpl.mr = mlx5_mp2mr(priv->pd, mp);
1002 if (tmpl.mr == NULL) {
1004 ERROR("%p: MR creation failure: %s",
1005 (void *)dev, strerror(ret));
/* Resource domain: single-threaded, high-bandwidth message model. */
1008 attr.rd = (struct ibv_exp_res_domain_init_attr){
1009 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
1010 IBV_EXP_RES_DOMAIN_MSG_MODEL),
1011 .thread_model = IBV_EXP_THREAD_SINGLE,
1012 .msg_model = IBV_EXP_MSG_HIGH_BW,
1014 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
1015 if (tmpl.rd == NULL) {
1017 ERROR("%p: RD creation failure: %s",
1018 (void *)dev, strerror(ret));
1021 attr.cq = (struct ibv_exp_cq_init_attr){
1022 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
1023 .res_domain = tmpl.rd,
/* With CQE compression, double the CQ depth to absorb bursts. */
1025 if (priv->cqe_comp) {
1026 attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
1027 attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
1028 cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
1030 tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
1032 if (tmpl.cq == NULL) {
1034 ERROR("%p: CQ creation failure: %s",
1035 (void *)dev, strerror(ret));
1038 DEBUG("priv->device_attr.max_qp_wr is %d",
1039 priv->device_attr.max_qp_wr);
1040 DEBUG("priv->device_attr.max_sge is %d",
1041 priv->device_attr.max_sge);
1042 /* Configure VLAN stripping. */
1043 tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1044 !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1045 attr.wq = (struct ibv_exp_wq_init_attr){
1046 .wq_context = NULL, /* Could be useful in the future. */
1047 .wq_type = IBV_EXP_WQT_RQ,
1048 /* Max number of outstanding WRs. */
1049 .max_recv_wr = desc >> tmpl.rxq.sges_n,
1050 /* Max number of scatter/gather elements in a WR. */
1051 .max_recv_sge = 1 << tmpl.rxq.sges_n,
1055 IBV_EXP_CREATE_WQ_RES_DOMAIN |
1056 IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1058 .res_domain = tmpl.rd,
1059 .vlan_offloads = (tmpl.rxq.vlan_strip ?
1060 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1063 /* By default, FCS (CRC) is stripped by hardware. */
1064 if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1065 tmpl.rxq.crc_present = 0;
1066 } else if (priv->hw_fcs_strip) {
1067 /* Ask HW/Verbs to leave CRC in place when supported. */
1068 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1069 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1070 tmpl.rxq.crc_present = 1;
1072 WARN("%p: CRC stripping has been disabled but will still"
1073 " be performed by hardware, make sure MLNX_OFED and"
1074 " firmware are up to date",
1076 tmpl.rxq.crc_present = 0;
1078 DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1079 " incoming frames to hide it",
1081 tmpl.rxq.crc_present ? "disabled" : "enabled",
1082 tmpl.rxq.crc_present << 2);
1083 if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1084 ; /* Nothing else to do. */
1085 else if (priv->hw_padding) {
1086 INFO("%p: enabling packet padding on queue %p",
1087 (void *)dev, (void *)rxq_ctrl);
1088 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1089 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1091 WARN("%p: packet padding has been requested but is not"
1092 " supported, make sure MLNX_OFED and firmware are"
1096 tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1097 if (tmpl.wq == NULL) {
1098 ret = (errno ? errno : EINVAL);
1099 ERROR("%p: WQ creation failure: %s",
1100 (void *)dev, strerror(ret));
1104 * Make sure number of WRs*SGEs match expectations since a queue
1105 * cannot allocate more than "desc" buffers.
1107 if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
1108 ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
1109 ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1111 (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
1112 attr.wq.max_recv_wr, attr.wq.max_recv_sge);
1117 tmpl.rxq.port_id = dev->data->port_id;
1118 DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
/* Query fast-path interface families for CQ and WQ. */
1119 attr.params = (struct ibv_exp_query_intf_params){
1120 .intf_scope = IBV_EXP_INTF_GLOBAL,
1122 .intf = IBV_EXP_INTF_CQ,
1125 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1126 if (tmpl.if_cq == NULL) {
1127 ERROR("%p: CQ interface family query failed with status %d",
1128 (void *)dev, status);
1131 attr.params = (struct ibv_exp_query_intf_params){
1132 .intf_scope = IBV_EXP_INTF_GLOBAL,
1133 .intf = IBV_EXP_INTF_WQ,
1136 tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1137 if (tmpl.if_wq == NULL) {
1138 ERROR("%p: WQ interface family query failed with status %d",
1139 (void *)dev, status);
1142 /* Change queue state to ready. */
1143 mod = (struct ibv_exp_wq_attr){
1144 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1145 .wq_state = IBV_EXP_WQS_RDY,
1147 ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1149 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1150 (void *)dev, strerror(ret));
1153 ret = rxq_setup(&tmpl);
1155 ERROR("%p: cannot initialize RX queue structure: %s",
1156 (void *)dev, strerror(ret));
1159 /* Reuse buffers from original queue if possible. */
1160 if (rxq_ctrl->rxq.elts_n) {
1161 assert(1 << rxq_ctrl->rxq.elts_n == desc);
1162 assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
1163 ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
1165 ret = rxq_alloc_elts(&tmpl, desc, NULL);
1167 ERROR("%p: RXQ allocation failed: %s",
1168 (void *)dev, strerror(ret));
1171 /* Clean up rxq in case we're reinitializing it. */
1172 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1173 rxq_cleanup(rxq_ctrl);
1174 /* Move mbuf pointers to dedicated storage area in RX queue. */
1175 elts = (void *)(rxq_ctrl + 1);
1176 rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
/* Poison the temporary copy (debug builds presumably) before
 * freeing it — TODO confirm this is under #ifndef NDEBUG. */
1178 memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
1180 rte_free(tmpl.rxq.elts);
1181 tmpl.rxq.elts = elts;
1183 /* Update doorbell counter. */
1184 rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
1186 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
1187 DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
/* Error path: tmpl owns all resources; rxq_cleanup(&tmpl) presumably
 * releases them — this dump is missing those lines. */
1191 elts = tmpl.rxq.elts;
1199 * DPDK callback to configure a RX queue.
1202 * Pointer to Ethernet device structure.
1206 * Number of descriptors to configure in queue.
1208 * NUMA socket on which memory must be allocated.
1210 * Thresholds parameters.
1212 * Memory pool for buffer allocations.
1215 * 0 on success, negative errno value on failure.
1218 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1219 unsigned int socket, const struct rte_eth_rxconf *conf,
1220 struct rte_mempool *mp)
1222 struct priv *priv = dev->data->dev_private;
1223 struct rxq *rxq = (*priv->rxqs)[idx];
1224 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
/* Secondary processes must not configure queues. */
1227 if (mlx5_is_secondary())
1228 return -E_RTE_SECONDARY;
/* Round descriptor count up to a power of two (required by the
 * log2-based ring sizing). */
1231 if (!rte_is_power_of_2(desc)) {
1232 desc = 1 << log2above(desc);
1233 WARN("%p: increased number of descriptors in RX queue %u"
1234 " to the next power of two (%d)",
1235 (void *)dev, idx, desc);
1237 DEBUG("%p: configuring queue %u for %u descriptors",
1238 (void *)dev, idx, desc);
1239 if (idx >= priv->rxqs_n) {
1240 ERROR("%p: queue index out of range (%u >= %u)",
1241 (void *)dev, idx, priv->rxqs_n);
/* Reconfiguration path: tear down the old queue first, and
 * reallocate the control structure if the ring size changed. */
1246 DEBUG("%p: reusing already allocated queue index %u (%p)",
1247 (void *)dev, idx, (void *)rxq);
1248 if (priv->started) {
1252 (*priv->rxqs)[idx] = NULL;
1253 rxq_cleanup(rxq_ctrl);
1254 /* Resize if rxq size is changed. */
1255 if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
1256 rxq_ctrl = rte_realloc(rxq_ctrl,
/* Trailing space stores the per-descriptor mbuf pointer array
 * (see "elts = (void *)(rxq_ctrl + 1)" in rxq_ctrl_setup()). */
1258 desc * sizeof(struct rte_mbuf *),
1259 RTE_CACHE_LINE_SIZE);
1261 ERROR("%p: unable to reallocate queue index %u",
1268 rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
1269 desc * sizeof(struct rte_mbuf *),
1271 if (rxq_ctrl == NULL) {
1272 ERROR("%p: unable to allocate queue index %u",
1278 ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1282 rxq_ctrl->rxq.stats.idx = idx;
1283 DEBUG("%p: adding RX queue %p to list",
1284 (void *)dev, (void *)rxq_ctrl);
1285 (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1286 /* Update receive callback. */
1287 priv_select_rx_function(priv);
1294 * DPDK callback to release a RX queue.
1297 * Generic RX queue pointer.
1300 mlx5_rx_queue_release(void *dpdk_rxq)
1302 struct rxq *rxq = (struct rxq *)dpdk_rxq;
1303 struct rxq_ctrl *rxq_ctrl;
/* Secondary processes must not release queues. */
1307 if (mlx5_is_secondary())
1312 rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1313 priv = rxq_ctrl->priv;
/* Unlink the queue from priv->rxqs before destroying it. */
1315 for (i = 0; (i != priv->rxqs_n); ++i)
1316 if ((*priv->rxqs)[i] == rxq) {
1317 DEBUG("%p: removing RX queue %p from list",
1318 (void *)priv->dev, (void *)rxq_ctrl);
1319 (*priv->rxqs)[i] = NULL;
1322 rxq_cleanup(rxq_ctrl);
1328 * DPDK callback for RX in secondary processes.
1330 * This function configures all queues from primary process information
1331 * if necessary before reverting to the normal RX burst callback.
1334 * Generic pointer to RX queue structure.
1336 * Array to store received packets.
1338 * Maximum number of packets in array.
1341 * Number of packets successfully received (<= pkts_n).
1344 mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1347 struct rxq *rxq = dpdk_rxq;
1348 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1349 struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
1350 struct priv *primary_priv;
1356 mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1357 /* Look for queue index in both private structures. */
/* The passed rxq may belong to either the primary or the secondary
 * private structure; resolve its index then reroute through the
 * secondary's own queue. */
1358 for (index = 0; index != priv->rxqs_n; ++index)
1359 if (((*primary_priv->rxqs)[index] == rxq) ||
1360 ((*priv->rxqs)[index] == rxq))
1362 if (index == priv->rxqs_n)
1364 rxq = (*priv->rxqs)[index];
/* Delegate to the real RX burst function selected for this device. */
1365 return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);