vnet/vnet/devices/dpdk/device.c

   1 /*
   2  * Copyright (c) 2015 Cisco and/or its affiliates.
   3  * Licensed under the Apache License, Version 2.0 (the "License");
   4  * you may not use this file except in compliance with the License.
   5  * You may obtain a copy of the License at:
   6  *
   7  *     http://www.apache.org/licenses/LICENSE-2.0
   8  *
   9  * Unless required by applicable law or agreed to in writing, software
  10  * distributed under the License is distributed on an "AS IS" BASIS,
  11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12  * See the License for the specific language governing permissions and
  13  * limitations under the License.
  14  */
  15 #include <vnet/vnet.h>
  16 #include <vppinfra/vec.h>
  17 #include <vppinfra/format.h>
  18 #include <vlib/unix/cj.h>
  19 #include <assert.h>
  20
  21 #include <vnet/ethernet/ethernet.h>
  22 #include <vnet/devices/dpdk/dpdk.h>
  23
  24 #include "dpdk_priv.h"
  25 #include <vppinfra/error.h>
  26
  27 #define foreach_dpdk_tx_func_error                      \
  28   _(BAD_RETVAL, "DPDK tx function returned an error")   \
  29   _(RING_FULL, "Tx packet drops (ring full)")           \
  30   _(PKT_DROP, "Tx packet drops (dpdk tx failure)")      \
  31   _(REPL_FAIL, "Tx packet drops (replication failure)")
  32
  33 typedef enum {
  34 #define _(f,s) DPDK_TX_FUNC_ERROR_##f,
  35   foreach_dpdk_tx_func_error
  36 #undef _
  37   DPDK_TX_FUNC_N_ERROR,
  38 } dpdk_tx_func_error_t;
  39
  40 static char * dpdk_tx_func_error_strings[] = {
  41 #define _(n,s) s,
  42     foreach_dpdk_tx_func_error
  43 #undef _
  44 };
  45
  46 static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
  47 {
  48   vlib_main_t * vm = vlib_get_main();
  49   vlib_buffer_main_t * bm = vm->buffer_main;
  50   struct rte_mbuf * first_mb = 0, * new_mb, * pkt_mb, ** prev_mb_next = 0;
  51   u8 nb_segs, nb_segs_left;
  52   u32 copy_bytes;
  53   unsigned socket_id = rte_socket_id();
  54
  55   ASSERT (bm->pktmbuf_pools[socket_id]);
  56   pkt_mb = ((struct rte_mbuf *)b)-1;
  57   nb_segs = pkt_mb->nb_segs;
  58   for (nb_segs_left = nb_segs; nb_segs_left; nb_segs_left--)
  59     {
  60       if (PREDICT_FALSE(pkt_mb == 0))
  61         {
  62           clib_warning ("Missing %d mbuf chain segment(s):   "
  63                         "(nb_segs = %d, nb_segs_left = %d)!",
  64                         nb_segs - nb_segs_left, nb_segs, nb_segs_left);
  65           if (first_mb)
  66             rte_pktmbuf_free(first_mb);
  67           return NULL;
  68         }
  69       new_mb = rte_pktmbuf_alloc (bm->pktmbuf_pools[socket_id]);
  70       if (PREDICT_FALSE(new_mb == 0))
  71         {
  72           if (first_mb)
  73             rte_pktmbuf_free(first_mb);
  74           return NULL;
  75         }
  76
  77       /*
  78        * Copy packet info into 1st segment.
  79        */
  80       if (first_mb == 0)
  81         {
  82           first_mb = new_mb;
  83           rte_pktmbuf_pkt_len (first_mb) = pkt_mb->pkt_len;
  84           first_mb->nb_segs = pkt_mb->nb_segs;
  85           first_mb->port = pkt_mb->port;
  86 #ifdef DAW_FIXME // TX Offload support TBD
  87           first_mb->vlan_macip = pkt_mb->vlan_macip;
  88           first_mb->hash = pkt_mb->hash;
  89           first_mb->ol_flags = pkt_mb->ol_flags
  90 #endif
  91         }
  92       else
  93         {
  94           ASSERT(prev_mb_next != 0);
  95           *prev_mb_next = new_mb;
  96         }
  97
  98       /*
  99        * Copy packet segment data into new mbuf segment.
 100        */
 101       rte_pktmbuf_data_len (new_mb) = pkt_mb->data_len;
 102       copy_bytes = pkt_mb->data_len + RTE_PKTMBUF_HEADROOM;
 103       ASSERT(copy_bytes <= pkt_mb->buf_len);
 104       memcpy(new_mb->buf_addr, pkt_mb->buf_addr, copy_bytes);
 105
 106       prev_mb_next = &new_mb->next;
 107       pkt_mb = pkt_mb->next;
 108     }
 109
 110   ASSERT(pkt_mb == 0);
 111   __rte_mbuf_sanity_check(first_mb, 1);
 112
 113   return first_mb;
 114 }
 115
 116 static void
 117 dpdk_tx_trace_buffer (dpdk_main_t * dm,
 118                       vlib_node_runtime_t * node,
 119                       dpdk_device_t * xd,
 120                       u16 queue_id,
 121                       u32 buffer_index,
 122                       vlib_buffer_t * buffer)
 123 {
 124   vlib_main_t * vm = vlib_get_main();
 125   dpdk_tx_dma_trace_t * t0;
 126   struct rte_mbuf * mb;
 127
 128   mb = ((struct rte_mbuf *)buffer)-1;
 129
 130   t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0]));
 131   t0->queue_index = queue_id;
 132   t0->device_index = xd->device_index;
 133   t0->buffer_index = buffer_index;
 134   memcpy (&t0->mb, mb, sizeof (t0->mb));
 135   memcpy (&t0->buffer, buffer, sizeof (buffer[0]) - sizeof (buffer->pre_data));
 136   memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data,
 137           sizeof (t0->buffer.pre_data));
 138 }
 139
 140 /*
 141  * This function calls the dpdk's tx_burst function to transmit the packets
 142  * on the tx_vector. It manages a lock per-device if the device does not
 143  * support multiple queues. It returns the number of packets untransmitted
 144  * on the tx_vector. If all packets are transmitted (the normal case), the
 145  * function returns 0.
 146  *
 147  * The tx_burst function may not be able to transmit all packets because the
 148  * dpdk ring is full. If a flowcontrol callback function has been configured
 149  * then the function simply returns. If no callback has been configured, the
 150  * function will retry calling tx_burst with the remaining packets. This will
 151  * continue until all packets are transmitted or tx_burst indicates no packets
 152  * could be transmitted. (The caller can drop the remaining packets.)
 153  *
 154  * The function assumes there is at least one packet on the tx_vector.
 155  */
 156 static_always_inline
 157 u32 tx_burst_vector_internal (vlib_main_t * vm,
 158                               dpdk_device_t * xd,
 159                               struct rte_mbuf ** tx_vector)
 160 {
 161   dpdk_main_t * dm = &dpdk_main;
 162   u32 n_packets;
 163   u32 tx_head;
 164   u32 tx_tail;
 165   u32 n_retry;
 166   int rv;
 167   int queue_id;
 168   tx_ring_hdr_t *ring;
 169
 170   ring = vec_header(tx_vector, sizeof(*ring));
 171
 172   n_packets = ring->tx_head - ring->tx_tail;
 173
 174   tx_head = ring->tx_head % DPDK_TX_RING_SIZE;
 175
 176   /*
 177    * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to
 178    * unpredictable results.
 179    */
 180   ASSERT(n_packets > 0);
 181
 182   /*
 183    * Check for tx_vector overflow. If this fails it is a system configuration
 184    * error. The ring should be sized big enough to handle the largest un-flowed
 185    * off burst from a traffic manager. A larger size also helps performance
 186    * a bit because it decreases the probability of having to issue two tx_burst
 187    * calls due to a ring wrap.
 188    */
 189   ASSERT(n_packets < DPDK_TX_RING_SIZE);
 190
 191   /*
 192    * If there is no flowcontrol callback, there is only temporary buffering
 193    * on the tx_vector and so the tail should always be 0.
 194    */
 195   ASSERT(dm->flowcontrol_callback || ring->tx_tail == 0);
 196
 197   /*
 198    * If there is a flowcontrol callback, don't retry any incomplete tx_bursts.
 199    * Apply backpressure instead. If there is no callback, keep retrying until
 200    * a tx_burst sends no packets. n_retry of 255 essentially means no retry
 201    * limit.
 202    */
 203   n_retry = dm->flowcontrol_callback ? 0 : 255;
 204
 205   queue_id = vm->cpu_index;
 206
 207   do {
 208       /* start the burst at the tail */
 209       tx_tail = ring->tx_tail % DPDK_TX_RING_SIZE;
 210
 211       /*
 212        * This device only supports one TX queue,
 213        * and we're running multi-threaded...
 214        */
 215       if (PREDICT_FALSE(xd->dev_type != VNET_DPDK_DEV_VHOST_USER &&
 216         xd->lockp != 0))
 217         {
 218           queue_id = queue_id % xd->tx_q_used;
 219           while (__sync_lock_test_and_set (xd->lockp[queue_id], 1))
 220             /* zzzz */
 221             queue_id = (queue_id + 1) % xd->tx_q_used;
 222         }
 223
 224       if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH))
 225         {
 226           if (PREDICT_TRUE(tx_head > tx_tail))
 227             {
 228               /* no wrap, transmit in one burst */
 229               rv = rte_eth_tx_burst(xd->device_index,
 230                                     (uint16_t) queue_id,
 231                                     &tx_vector[tx_tail],
 232                                     (uint16_t) (tx_head-tx_tail));
 233             }
 234           else
 235             {
 236               /*
 237                * This can only happen if there is a flowcontrol callback.
 238                * We need to split the transmit into two calls: one for
 239                * the packets up to the wrap point, and one to continue
 240                * at the start of the ring.
 241                * Transmit pkts up to the wrap point.
 242                */
 243               rv = rte_eth_tx_burst(xd->device_index,
 244                                     (uint16_t) queue_id,
 245                                     &tx_vector[tx_tail],
 246                                     (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
 247
 248               /*
 249                * If we transmitted everything we wanted, then allow 1 retry
 250                * so we can try to transmit the rest. If we didn't transmit
 251                * everything, stop now.
 252                */
 253               n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
 254             }
 255         }
 256       else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
 257         {
 258           u32 offset = 0;
 259           if (xd->need_txlock) {
 260             queue_id = 0;
 261             while (__sync_lock_test_and_set (xd->lockp[queue_id], 1));
 262           }
 263 #if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0)
 264           else {
 265               dpdk_device_and_queue_t * dq;
 266               vec_foreach (dq, dm->devices_by_cpu[vm->cpu_index])
 267               {
 268                 if (xd->device_index == dq->device)
 269                     break;
 270               }
 271               assert (dq);
 272               offset = dq->queue_id * VIRTIO_QNUM;
 273           }
 274 #endif
 275           if (PREDICT_TRUE(tx_head > tx_tail))
 276             {
 277               /* no wrap, transmit in one burst */
 278               rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, offset + VIRTIO_RXQ,
 279                                            &tx_vector[tx_tail],
 280                                            (uint16_t) (tx_head-tx_tail));
 281               if (PREDICT_TRUE(rv > 0))
 282                 {
 283                   if (dpdk_vhost_user_want_interrupt(xd, offset + VIRTIO_RXQ)) {
 284                     dpdk_vu_vring *vring = &(xd->vu_intf->vrings[offset + VIRTIO_RXQ]);
 285                     vring->n_since_last_int += rv;
 286
 287                     f64 now = vlib_time_now (vm);
 288                     if (vring->int_deadline < now ||
 289                         vring->n_since_last_int > dm->vhost_coalesce_frames)
 290                       dpdk_vhost_user_send_interrupt(vm, xd, offset + VIRTIO_RXQ);
 291                   }
 292
 293                   int c = rv;
 294                   while(c--)
 295                     rte_pktmbuf_free (tx_vector[tx_tail+c]);
 296                 }
 297             }
 298           else
 299             {
 300               /*
 301                * If we transmitted everything we wanted, then allow 1 retry
 302                * so we can try to transmit the rest. If we didn't transmit
 303                * everything, stop now.
 304                */
 305               rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, offset + VIRTIO_RXQ,
 306                                            &tx_vector[tx_tail],
 307                                            (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
 308
 309               if (PREDICT_TRUE(rv > 0))
 310                 {
 311                   if (dpdk_vhost_user_want_interrupt(xd, offset + VIRTIO_RXQ)) {
 312                     dpdk_vu_vring *vring = &(xd->vu_intf->vrings[offset + VIRTIO_RXQ]);
 313                     vring->n_since_last_int += rv;
 314
 315                     f64 now = vlib_time_now (vm);
 316                     if (vring->int_deadline < now ||
 317                         vring->n_since_last_int > dm->vhost_coalesce_frames)
 318                       dpdk_vhost_user_send_interrupt(vm, xd, offset + VIRTIO_RXQ);
 319                   }
 320
 321                   int c = rv;
 322                   while(c--)
 323                     rte_pktmbuf_free (tx_vector[tx_tail+c]);
 324                 }
 325
 326               n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
 327             }
 328
 329           if (xd->need_txlock)
 330             *xd->lockp[queue_id] = 0;
 331         }
 332 #if RTE_LIBRTE_KNI
 333       else if (xd->dev_type == VNET_DPDK_DEV_KNI)
 334         {
 335           if (PREDICT_TRUE(tx_head > tx_tail))
 336             {
 337               /* no wrap, transmit in one burst */
 338               rv = rte_kni_tx_burst(xd->kni,
 339                                     &tx_vector[tx_tail],
 340                                     (uint16_t) (tx_head-tx_tail));
 341             }
 342           else
 343             {
 344               /*
 345                * This can only happen if there is a flowcontrol callback.
 346                * We need to split the transmit into two calls: one for
 347                * the packets up to the wrap point, and one to continue
 348                * at the start of the ring.
 349                * Transmit pkts up to the wrap point.
 350                */
 351               rv = rte_kni_tx_burst(xd->kni,
 352                                     &tx_vector[tx_tail],
 353                                     (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
 354
 355               /*
 356                * If we transmitted everything we wanted, then allow 1 retry
 357                * so we can try to transmit the rest. If we didn't transmit
 358                * everything, stop now.
 359                */
 360               n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
 361             }
 362         }
 363 #endif
 364       else
 365         {
 366           ASSERT(0);
 367           rv = 0;
 368         }
 369
 370       if (PREDICT_FALSE(xd->dev_type != VNET_DPDK_DEV_VHOST_USER &&
 371             xd->lockp != 0))
 372           *xd->lockp[queue_id] = 0;
 373
 374       if (PREDICT_FALSE(rv < 0))
 375         {
 376           // emit non-fatal message, bump counter
 377           vnet_main_t * vnm = dm->vnet_main;
 378           vnet_interface_main_t * im = &vnm->interface_main;
 379           u32 node_index;
 380
 381           node_index = vec_elt_at_index(im->hw_interfaces,
 382                                         xd->vlib_hw_if_index)->tx_node_index;
 383
 384           vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1);
 385           clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index, rv);
 386           return n_packets; // untransmitted packets
 387         }
 388       ring->tx_tail += (u16)rv;
 389       n_packets -= (uint16_t) rv;
 390   } while (rv && n_packets && (n_retry>0));
 391
 392   return n_packets;
 393 }
 394
 395
 396 /*
 397  * This function transmits any packets on the interface's tx_vector and returns
 398  * the number of packets untransmitted on the tx_vector. If the tx_vector is
 399  * empty the function simply returns 0.
 400  *
 401  * It is intended to be called by a traffic manager which has flowed-off an
 402  * interface to see if the interface can be flowed-on again.
 403  */
 404 u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance)
 405 {
 406   dpdk_main_t * dm = &dpdk_main;
 407   dpdk_device_t * xd;
 408   int queue_id;
 409   struct rte_mbuf ** tx_vector;
 410   tx_ring_hdr_t *ring;
 411
 412   /* param is dev_instance and not hw_if_index to save another lookup */
 413   xd = vec_elt_at_index (dm->devices, dev_instance);
 414
 415   queue_id = vm->cpu_index;
 416   tx_vector = xd->tx_vectors[queue_id];
 417
 418   /* If no packets on the ring, don't bother calling tx function */
 419   ring = vec_header(tx_vector, sizeof(*ring));
 420   if (ring->tx_head == ring->tx_tail)
 421     {
 422       return 0;
 423     }
 424
 425   return tx_burst_vector_internal (vm, xd, tx_vector);
 426 }
 427
 428 /*
 429  * Transmits the packets on the frame to the interface associated with the
 430  * node. It first copies packets on the frame to a tx_vector containing the
 431  * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal
 432  * which calls the dpdk tx_burst function.
 433  *
 434  * The tx_vector is treated slightly differently depending on whether or
 435  * not a flowcontrol callback function has been configured. If there is no
 436  * callback, the tx_vector is a temporary array of rte_mbuf packet pointers.
 437  * Its entries are written and consumed before the function exits.
 438  *
 439  * If there is a callback then the transmit is being invoked in the presence
 440  * of a traffic manager. Here the tx_vector is treated like a ring of rte_mbuf
 441  * pointers. If not all packets can be transmitted, the untransmitted packets
 442  * stay on the tx_vector until the next call. The callback allows the traffic
 443  * manager to flow-off dequeues to the interface. The companion function
 444  * dpdk_interface_tx_vector() allows the traffic manager to detect when
 445  * it should flow-on the interface again.
 446  */
 447 static uword
 448 dpdk_interface_tx (vlib_main_t * vm,
 449            vlib_node_runtime_t * node,
 450            vlib_frame_t * f)
 451 {
 452   dpdk_main_t * dm = &dpdk_main;
 453   vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
 454   dpdk_device_t * xd = vec_elt_at_index (dm->devices, rd->dev_instance);
 455   u32 n_packets = f->n_vectors;
 456   u32 n_left;
 457   u32 * from;
 458   struct rte_mbuf ** tx_vector;
 459   int i;
 460   int queue_id;
 461   u32 my_cpu;
 462   u32 tx_pkts = 0;
 463   tx_ring_hdr_t *ring;
 464   u32 n_on_ring;
 465
 466   my_cpu = vm->cpu_index;
 467
 468   queue_id = my_cpu;
 469
 470   tx_vector = xd->tx_vectors[queue_id];
 471   ring = vec_header(tx_vector, sizeof(*ring));
 472
 473   n_on_ring = ring->tx_head - ring->tx_tail;
 474   from = vlib_frame_vector_args (f);
 475
 476   ASSERT(n_packets <= VLIB_FRAME_SIZE);
 477
 478   if (PREDICT_FALSE(n_on_ring + n_packets > DPDK_TX_RING_SIZE))
 479     {
 480       /*
 481        * Overflowing the ring should never happen.
 482        * If it does then drop the whole frame.
 483        */
 484       vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL,
 485                         n_packets);
 486
 487       while (n_packets--)
 488         {
 489           u32 bi0 = from[n_packets];
 490           vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
 491           struct rte_mbuf *mb0 = ((struct rte_mbuf *)b0) - 1;
 492           rte_pktmbuf_free (mb0);
 493         }
 494       return n_on_ring;
 495     }
 496
 497   if (PREDICT_FALSE(dm->tx_pcap_enable))
 498     {
 499       n_left = n_packets;
 500       while (n_left > 0)
 501         {
 502           u32 bi0 = from[0];
 503           vlib_buffer_t * b0 = vlib_get_buffer (vm, bi0);
 504           if (dm->pcap_sw_if_index == 0 ||
 505               dm->pcap_sw_if_index == vnet_buffer(b0)->sw_if_index [VLIB_TX])
 506               pcap_add_buffer (&dm->pcap_main, vm, bi0, 512);
 507           from++;
 508           n_left--;
 509         }
 510     }
 511
 512   from = vlib_frame_vector_args (f);
 513   n_left = n_packets;
 514   i = ring->tx_head % DPDK_TX_RING_SIZE;
 515
 516   while (n_left >= 4)
 517     {
 518       u32 bi0, bi1;
 519       u32 pi0, pi1;
 520       struct rte_mbuf * mb0, * mb1;
 521       struct rte_mbuf * prefmb0, * prefmb1;
 522       vlib_buffer_t * b0, * b1;
 523       vlib_buffer_t * pref0, * pref1;
 524       i16 delta0, delta1;
 525       u16 new_data_len0, new_data_len1;
 526       u16 new_pkt_len0, new_pkt_len1;
 527       u32 any_clone;
 528
 529       pi0 = from[2];
 530       pi1 = from[3];
 531       pref0 = vlib_get_buffer (vm, pi0);
 532       pref1 = vlib_get_buffer (vm, pi1);
 533
 534       prefmb0 = ((struct rte_mbuf *)pref0) - 1;
 535       prefmb1 = ((struct rte_mbuf *)pref1) - 1;
 536
 537       CLIB_PREFETCH(prefmb0, CLIB_CACHE_LINE_BYTES, LOAD);
 538       CLIB_PREFETCH(pref0, CLIB_CACHE_LINE_BYTES, LOAD);
 539       CLIB_PREFETCH(prefmb1, CLIB_CACHE_LINE_BYTES, LOAD);
 540       CLIB_PREFETCH(pref1, CLIB_CACHE_LINE_BYTES, LOAD);
 541
 542       bi0 = from[0];
 543       bi1 = from[1];
 544       from += 2;
 545
 546       b0 = vlib_get_buffer (vm, bi0);
 547       b1 = vlib_get_buffer (vm, bi1);
 548
 549       mb0 = ((struct rte_mbuf *)b0) - 1;
 550       mb1 = ((struct rte_mbuf *)b1) - 1;
 551
 552       any_clone = b0->clone_count | b1->clone_count;
 553       if (PREDICT_FALSE(any_clone != 0))
 554         {
 555           if (PREDICT_FALSE(b0->clone_count != 0))
 556         {
 557           struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
 558           if (PREDICT_FALSE(mb0_new == 0))
 559             {
 560               vlib_error_count (vm, node->node_index,
 561                     DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
 562               b0->flags |= VLIB_BUFFER_REPL_FAIL;
 563             }
 564           else
 565             mb0 = mb0_new;
 566           vec_add1 (dm->recycle[my_cpu], bi0);
 567         }
 568           if (PREDICT_FALSE(b1->clone_count != 0))
 569         {
 570           struct rte_mbuf * mb1_new = dpdk_replicate_packet_mb (b1);
 571           if (PREDICT_FALSE(mb1_new == 0))
 572             {
 573               vlib_error_count (vm, node->node_index,
 574                     DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
 575               b1->flags |= VLIB_BUFFER_REPL_FAIL;
 576             }
 577           else
 578             mb1 = mb1_new;
 579           vec_add1 (dm->recycle[my_cpu], bi1);
 580         }
 581     }
 582
 583       delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
 584     vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
 585       delta1 = PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
 586     vlib_buffer_length_in_chain (vm, b1) - (i16) mb1->pkt_len;
 587
 588       new_data_len0 = (u16)((i16) mb0->data_len + delta0);
 589       new_data_len1 = (u16)((i16) mb1->data_len + delta1);
 590       new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
 591       new_pkt_len1 = (u16)((i16) mb1->pkt_len + delta1);
 592
 593       b0->current_length = new_data_len0;
 594       b1->current_length = new_data_len1;
 595       mb0->data_len = new_data_len0;
 596       mb1->data_len = new_data_len1;
 597       mb0->pkt_len = new_pkt_len0;
 598       mb1->pkt_len = new_pkt_len1;
 599
 600       mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
 601           mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
 602       mb1->data_off = (PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL)) ?
 603           mb1->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b1->current_data);
 604
 605       if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
 606     {
 607           if (b0->flags & VLIB_BUFFER_IS_TRACED)
 608               dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
 609           if (b1->flags & VLIB_BUFFER_IS_TRACED)
 610               dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
 611     }
 612
 613       if (PREDICT_TRUE(any_clone == 0))
 614         {
 615       tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
 616           i++;
 617       tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
 618           i++;
 619         }
 620       else
 621         {
 622           /* cloning was done, need to check for failure */
 623           if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
 624             {
 625           tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
 626               i++;
 627             }
 628           if (PREDICT_TRUE((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
 629             {
 630           tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
 631               i++;
 632             }
 633         }
 634
 635       n_left -= 2;
 636     }
 637   while (n_left > 0)
 638     {
 639       u32 bi0;
 640       struct rte_mbuf * mb0;
 641       vlib_buffer_t * b0;
 642       i16 delta0;
 643       u16 new_data_len0;
 644       u16 new_pkt_len0;
 645
 646       bi0 = from[0];
 647       from++;
 648
 649       b0 = vlib_get_buffer (vm, bi0);
 650
 651       mb0 = ((struct rte_mbuf *)b0) - 1;
 652       if (PREDICT_FALSE(b0->clone_count != 0))
 653     {
 654       struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
 655       if (PREDICT_FALSE(mb0_new == 0))
 656         {
 657           vlib_error_count (vm, node->node_index,
 658                 DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
 659           b0->flags |= VLIB_BUFFER_REPL_FAIL;
 660         }
 661       else
 662         mb0 = mb0_new;
 663       vec_add1 (dm->recycle[my_cpu], bi0);
 664     }
 665
 666       delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
 667     vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
 668
 669       new_data_len0 = (u16)((i16) mb0->data_len + delta0);
 670       new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
 671
 672       b0->current_length = new_data_len0;
 673       mb0->data_len = new_data_len0;
 674       mb0->pkt_len = new_pkt_len0;
 675       mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
 676           mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
 677
 678       if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
 679           if (b0->flags & VLIB_BUFFER_IS_TRACED)
 680               dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
 681
 682       if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
 683         {
 684       tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
 685           i++;
 686         }
 687       n_left--;
 688     }
 689
 690   /* account for additional packets in the ring */
 691   ring->tx_head += n_packets;
 692   n_on_ring = ring->tx_head - ring->tx_tail;
 693
 694   /* transmit as many packets as possible */
 695   n_packets = tx_burst_vector_internal (vm, xd, tx_vector);
 696
 697   /*
 698    * tx_pkts is the number of packets successfully transmitted
 699    * This is the number originally on ring minus the number remaining on ring
 700    */
 701   tx_pkts = n_on_ring - n_packets;
 702
 703   if (PREDICT_FALSE(dm->flowcontrol_callback != 0))
 704     {
 705       if (PREDICT_FALSE(n_packets))
 706         {
 707           /* Callback may want to enable flowcontrol */
 708           dm->flowcontrol_callback(vm, xd->vlib_hw_if_index, ring->tx_head - ring->tx_tail);
 709         }
 710       else
 711         {
 712           /* Reset head/tail to avoid unnecessary wrap */
 713           ring->tx_head = 0;
 714           ring->tx_tail = 0;
 715         }
 716     }
 717   else
 718     {
 719       /* If there is no callback then drop any non-transmitted packets */
 720       if (PREDICT_FALSE(n_packets))
 721         {
 722           vlib_simple_counter_main_t * cm;
 723           vnet_main_t * vnm = vnet_get_main();
 724
 725           cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
 726                                  VNET_INTERFACE_COUNTER_TX_ERROR);
 727
 728           vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, n_packets);
 729
 730           vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
 731                 n_packets);
 732
 733           while (n_packets--)
 734             rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
 735         }
 736
 737         /* Reset head/tail to avoid unnecessary wrap */
 738         ring->tx_head = 0;
 739         ring->tx_tail = 0;
 740     }
 741
 742   /* Recycle replicated buffers */
 743   if (PREDICT_FALSE(vec_len(dm->recycle[my_cpu])))
 744     {
 745       vlib_buffer_free (vm, dm->recycle[my_cpu], vec_len(dm->recycle[my_cpu]));
 746       _vec_len(dm->recycle[my_cpu]) = 0;
 747     }
 748
 749   ASSERT(ring->tx_head >= ring->tx_tail);
 750
 751   return tx_pkts;
 752 }
 753
 754 static int dpdk_device_renumber (vnet_hw_interface_t * hi,
 755                                  u32 new_dev_instance)
 756 {
 757   dpdk_main_t * dm = &dpdk_main;
 758   dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
 759
 760   if (!xd || xd->dev_type != VNET_DPDK_DEV_VHOST_USER) {
 761       clib_warning("cannot renumber non-vhost-user interface (sw_if_index: %d)",
 762               hi->sw_if_index);
 763       return 0;
 764   }
 765
 766   xd->vu_if_id = new_dev_instance;
 767   return 0;
 768 }
 769
 770 static void dpdk_clear_hw_interface_counters (u32 instance)
 771 {
 772   dpdk_main_t * dm = &dpdk_main;
 773   dpdk_device_t * xd = vec_elt_at_index (dm->devices, instance);
 774
 775   /*
 776    * DAW-FIXME: VMXNET3 device stop/start doesn't work,
 777    * therefore fake the stop in the dpdk driver by
 778    * silently dropping all of the incoming pkts instead of
 779    * stopping the driver / hardware.
 780    */
 781   if (xd->admin_up != 0xff)
 782     {
 783       rte_eth_stats_reset (xd->device_index);
 784       memset (&xd->last_stats, 0, sizeof (xd->last_stats));
 785       dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
 786     }
 787   else
 788     {
 789       rte_eth_stats_reset (xd->device_index);
 790       memset(&xd->stats, 0, sizeof(xd->stats));
 791       memset (&xd->last_stats, 0, sizeof (xd->last_stats));
 792     }
 793   rte_eth_xstats_reset(xd->device_index);
 794 }
 795
 796 #ifdef RTE_LIBRTE_KNI
 797 static int
 798 kni_config_network_if(u8 port_id, u8 if_up)
 799 {
 800   vnet_main_t * vnm = vnet_get_main();
 801   dpdk_main_t * dm = &dpdk_main;
 802   dpdk_device_t * xd;
 803   uword *p;
 804
 805   p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
 806   if (p == 0) {
 807     clib_warning("unknown interface");
 808     return 0;
 809   } else {
 810     xd = vec_elt_at_index (dm->devices, p[0]);
 811   }
 812
 813   vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
 814                                if_up ? VNET_HW_INTERFACE_FLAG_LINK_UP |
 815                                ETH_LINK_FULL_DUPLEX : 0);
 816   return 0;
 817 }
 818
 819 static int
 820 kni_change_mtu(u8 port_id, unsigned new_mtu)
 821 {
 822   vnet_main_t * vnm = vnet_get_main();
 823   dpdk_main_t * dm = &dpdk_main;
 824   dpdk_device_t * xd;
 825   uword *p;
 826   vnet_hw_interface_t * hif;
 827
 828   p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
 829   if (p == 0) {
 830     clib_warning("unknown interface");
 831     return 0;
 832   } else {
 833     xd = vec_elt_at_index (dm->devices, p[0]);
 834   }
 835   hif = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
 836
 837   hif->max_packet_bytes = new_mtu;
 838
 839   return 0;
 840 }
 841 #endif
 842
 843 static clib_error_t *
 844 dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
 845 {
 846   vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
 847   uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
 848   dpdk_main_t * dm = &dpdk_main;
 849   dpdk_device_t * xd = vec_elt_at_index (dm->devices, hif->dev_instance);
 850   int rv = 0;
 851
 852 #ifdef RTE_LIBRTE_KNI
 853   if (xd->dev_type == VNET_DPDK_DEV_KNI)
 854   {
 855       if (is_up)
 856       {
 857           struct rte_kni_conf conf;
 858           struct rte_kni_ops ops;
 859           vlib_main_t * vm = vlib_get_main();
 860           vlib_buffer_main_t * bm = vm->buffer_main;
 861           memset(&conf, 0, sizeof(conf));
 862           snprintf(conf.name, RTE_KNI_NAMESIZE, "vpp%u", xd->kni_port_id);
 863           conf.mbuf_size = MBUF_SIZE;
 864           memset(&ops, 0, sizeof(ops));
 865           ops.port_id = xd->kni_port_id;
 866           ops.change_mtu = kni_change_mtu;
 867           ops.config_network_if = kni_config_network_if;
 868
 869           xd->kni = rte_kni_alloc(bm->pktmbuf_pools[rte_socket_id()], &conf, &ops);
 870           if (!xd->kni)
 871           {
 872             clib_warning("failed to allocate kni interface");
 873           }
 874           else
 875           {
 876             hif->max_packet_bytes = 1500; /* kni interface default value */
 877             xd->admin_up = 1;
 878           }
 879       }
 880       else
 881       {
 882         xd->admin_up = 0;
 883         rte_kni_release(xd->kni);
 884       }
 885       return 0;
 886   }
 887 #endif
 888   if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
 889     {
 890       if (is_up)
 891         {
 892           if (xd->vu_is_running)
 893             vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
 894                                  VNET_HW_INTERFACE_FLAG_LINK_UP |
 895                                  ETH_LINK_FULL_DUPLEX );
 896           xd->admin_up = 1;
 897         }
 898       else
 899         {
 900           vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
 901                               xd->admin_up = 0;
 902         }
 903
 904       return 0;
 905     }
 906
 907
 908   if (is_up)
 909     {
 910       f64 now = vlib_time_now (dm->vlib_main);
 911
 912       /*
 913        * DAW-FIXME: VMXNET3 device stop/start doesn't work,
 914        * therefore fake the stop in the dpdk driver by
 915        * silently dropping all of the incoming pkts instead of
 916        * stopping the driver / hardware.
 917        */
 918       if (xd->admin_up == 0)
 919         rv = rte_eth_dev_start (xd->device_index);
 920
 921       if (xd->promisc)
 922           rte_eth_promiscuous_enable(xd->device_index);
 923       else
 924           rte_eth_promiscuous_disable(xd->device_index);
 925
 926       rte_eth_allmulticast_enable (xd->device_index);
 927       xd->admin_up = 1;
 928       dpdk_update_counters (xd, now);
 929       dpdk_update_link_state (xd, now);
 930     }
 931   else
 932     {
 933       /*
 934        * DAW-FIXME: VMXNET3 device stop/start doesn't work,
 935        * therefore fake the stop in the dpdk driver by
 936        * silently dropping all of the incoming pkts instead of
 937        * stopping the driver / hardware.
 938        */
 939       if (xd->pmd != VNET_DPDK_PMD_VMXNET3)
 940          xd->admin_up = 0;
 941       else
 942          xd->admin_up = ~0;
 943
 944       rte_eth_allmulticast_disable (xd->device_index);
 945       vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
 946
 947       /*
 948        * DAW-FIXME: VMXNET3 device stop/start doesn't work,
 949        * therefore fake the stop in the dpdk driver by
 950        * silently dropping all of the incoming pkts instead of
 951        * stopping the driver / hardware.
 952        */
 953       if (xd->pmd != VNET_DPDK_PMD_VMXNET3)
 954           rte_eth_dev_stop (xd->device_index);
 955     }
 956
 957   if (rv < 0)
 958     clib_warning ("rte_eth_dev_%s error: %d", is_up ? "start" : "stop",
 959                   rv);
 960
 961   return /* no error */ 0;
 962 }
 963
 964 /*
 965  * Dynamically redirect all pkts from a specific interface
 966  * to the specified node
 967  */
 968 static void dpdk_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index,
 969                                           u32 node_index)
 970 {
 971   dpdk_main_t * xm = &dpdk_main;
 972   vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
 973   dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);
 974
 975   /* Shut off redirection */
 976   if (node_index == ~0)
 977     {
 978       xd->per_interface_next_index = node_index;
 979       return;
 980     }
 981
 982   xd->per_interface_next_index =
 983     vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index);
 984 }
 985
 986
 987 static clib_error_t *
 988 dpdk_subif_add_del_function (vnet_main_t * vnm,
 989                              u32 hw_if_index,
 990                              struct vnet_sw_interface_t * st,
 991                              int is_add)
 992 {
 993   dpdk_main_t * xm = &dpdk_main;
 994   vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
 995   dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);
 996   vnet_sw_interface_t * t = (vnet_sw_interface_t *) st;
 997   int r, vlan_offload;
 998
 999
1000   if (xd->dev_type != VNET_DPDK_DEV_ETH)
1001         return 0;
1002   /* currently we program VLANS only for IXGBE VF */
1003   if (xd->pmd != VNET_DPDK_PMD_IXGBEVF)
1004         return 0;
1005
1006   if (t->sub.eth.flags.no_tags == 1)
1007         return 0;
1008
1009   if ((t->sub.eth.flags.one_tag != 1) || (t->sub.eth.flags.exact_match != 1 ))
1010         return clib_error_return (0, "unsupported VLAN setup");
1011
1012
1013   vlan_offload = rte_eth_dev_get_vlan_offload(xd->device_index);
1014   vlan_offload |= ETH_VLAN_FILTER_OFFLOAD;
1015
1016   if ((r = rte_eth_dev_set_vlan_offload(xd->device_index, vlan_offload)))
1017         return clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d",
1018                                   xd->device_index, r);
1019
1020
1021   if ((r = rte_eth_dev_vlan_filter(xd->device_index, t->sub.eth.outer_vlan_id, is_add)))
1022         return clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d",
1023                                  xd->device_index, r);
1024
1025   return 0;
1026 }
1027
1028 VNET_DEVICE_CLASS (dpdk_device_class) = {
1029   .name = "dpdk",
1030   .tx_function = dpdk_interface_tx,
1031   .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR,
1032   .tx_function_error_strings = dpdk_tx_func_error_strings,
1033   .format_device_name = format_dpdk_device_name,
1034   .format_device = format_dpdk_device,
1035   .format_tx_trace = format_dpdk_tx_dma_trace,
1036   .clear_counters = dpdk_clear_hw_interface_counters,
1037   .admin_up_down_function = dpdk_interface_admin_up_down,
1038   .subif_add_del_function = dpdk_subif_add_del_function,
1039   .rx_redirect_to_node = dpdk_set_interface_next_node,
1040   .no_flatten_output_chains = 1,
1041   .name_renumber = dpdk_device_renumber,
1042 };
1043
1044 void dpdk_set_flowcontrol_callback (vlib_main_t *vm,
1045                                     dpdk_flowcontrol_callback_t callback)
1046 {
1047   dpdk_main.flowcontrol_callback = callback;
1048 }
1049
1050 #define UP_DOWN_FLAG_EVENT 1
1051
1052
1053 u32 dpdk_get_admin_up_down_in_progress (void)
1054 {
1055   return dpdk_main.admin_up_down_in_progress;
1056 }
1057
1058 static uword
1059 admin_up_down_process (vlib_main_t * vm,
1060                        vlib_node_runtime_t * rt,
1061                        vlib_frame_t * f)
1062 {
1063   clib_error_t * error = 0;
1064   uword event_type;
1065   uword *event_data = 0;
1066   u32 index;
1067   u32 sw_if_index;
1068   u32 flags;
1069
1070   while (1)
1071     {
1072       vlib_process_wait_for_event (vm);
1073
1074       event_type = vlib_process_get_events (vm, &event_data);
1075
1076       dpdk_main.admin_up_down_in_progress = 1;
1077
1078       for (index=0; index<vec_len(event_data); index++)
1079         {
1080           sw_if_index = event_data[index] >> 32;
1081           flags = (u32) event_data[index];
1082
1083           switch (event_type) {
1084           case UP_DOWN_FLAG_EVENT:
1085             error = vnet_sw_interface_set_flags (vnet_get_main(), sw_if_index, flags);
1086             clib_error_report(error);
1087             break;
1088           }
1089         }
1090
1091       vec_reset_length (event_data);
1092
1093       dpdk_main.admin_up_down_in_progress = 0;
1094
1095     }
1096   return 0; /* or not */
1097 }
1098
1099 VLIB_REGISTER_NODE (admin_up_down_process_node,static) = {
1100     .function = admin_up_down_process,
1101     .type = VLIB_NODE_TYPE_PROCESS,
1102     .name = "admin-up-down-process",
1103     .process_log2_n_stack_bytes = 17,  // 256KB
1104 };
1105
1106 /*
1107  * Asynchronously invoke vnet_sw_interface_set_flags via the admin_up_down
1108  * process. Useful for avoiding long blocking delays (>150ms) in the dpdk
1109  * drivers.
1110  * WARNING: when posting this event, no other interface-related calls should
1111  * be made (e.g. vnet_create_sw_interface()) while the event is being
1112  * processed (admin_up_down_in_progress). This is required in order to avoid
1113  * race conditions in manipulating interface data structures.
1114  */
1115 void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags)
1116 {
1117   vlib_process_signal_event
1118       (vm, admin_up_down_process_node.index,
1119        UP_DOWN_FLAG_EVENT,
1120        (((uword)sw_if_index << 32) | flags));
1121 }
1122
1123 /*
1124  * Called by the dpdk driver's rte_delay_us() function.
1125  * Return 0 to have the dpdk do a regular delay loop.
1126  * Return 1 if to skip the delay loop because we are suspending
1127  * the calling vlib process instead.
1128  */
1129 int rte_delay_us_override (unsigned us) {
1130   vlib_main_t * vm;
1131
1132   /* Don't bother intercepting for short delays */
1133   if (us < 10) return 0;
1134
1135   /*
1136    * Only intercept if we are in a vlib process.
1137    * If we are called from a vlib worker thread or the vlib main
1138    * thread then do not intercept. (Must not be called from an
1139    * independent pthread).
1140    */
1141   if (os_get_cpu_number() == 0)
1142     {
1143       /*
1144        * We're in the vlib main thread or a vlib process. Make sure
1145        * the process is running and we're not still initializing.
1146        */
1147       vm = vlib_get_main();
1148       if (vlib_in_process_context(vm))
1149         {
1150           /* Only suspend for the admin_down_process */
1151           vlib_process_t * proc = vlib_get_current_process(vm);
1152           if (!(proc->flags & VLIB_PROCESS_IS_RUNNING) ||
1153               (proc->node_runtime.function != admin_up_down_process))
1154                 return 0;
1155
1156           f64 delay = 1e-6 * us;
1157           vlib_process_suspend(vm, delay);
1158           return 1;
1159         }
1160     }
1161   return 0; // no override
1162 }