/*
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * buffer.c: allocate/free network buffers.
 *
 * Copyright (c) 2008 Eliot Dresselhaus
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
/*
 * Allocate/free network buffers.
 */
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include <rte_config.h>

#include <rte_common.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_tailq.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_interrupts.h>
#include <rte_random.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_version.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vlib/linux/vfio.h>
#include <vnet/vnet.h>
#include <dpdk/device/dpdk.h>
#include <dpdk/device/dpdk_priv.h>
STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM,
	       "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM");
typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  struct rte_mbuf **mbuf_alloc_list;
} dpdk_buffer_per_thread_data;
typedef struct
{
  int vfio_container_fd;
  dpdk_buffer_per_thread_data *ptd;
} dpdk_buffer_main_t;
dpdk_buffer_main_t dpdk_buffer_main;
static_always_inline void
dpdk_rte_pktmbuf_free (vlib_main_t * vm, u32 thread_index, vlib_buffer_t * b,
		       int maybe_next)
{
  struct rte_mbuf *mb;
  u32 next, flags;

next:
  flags = b->flags;
  next = b->next_buffer;
  mb = rte_mbuf_from_vlib_buffer (b);

  /* Apply any deferred reference-count updates before freeing. */
  if (PREDICT_FALSE (b->n_add_refs))
    {
      rte_mbuf_refcnt_update (mb, b->n_add_refs);
      b->n_add_refs = 0;
    }

  /* prefree_seg returns non-NULL only when we held the last reference. */
  if ((mb = rte_pktmbuf_prefree_seg (mb)))
    rte_mempool_put (mb->pool, mb);

  if (maybe_next && (flags & VLIB_BUFFER_NEXT_PRESENT))
    {
      b = vlib_get_buffer (vm, next);
      goto next;
    }
}
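/*
 * Illustrative call (not part of the original file): freeing a chained
 * buffer by index.  With maybe_next = 1 the loop above keeps walking
 * b->next_buffer while VLIB_BUFFER_NEXT_PRESENT is set, so every segment
 * of the chain is returned to its owning mempool:
 *
 *   vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
 *   dpdk_rte_pktmbuf_free (vm, vlib_get_thread_index (), b, 1);
 */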
#ifndef CLIB_MARCH_VARIANT
static void
del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
{
  u32 i;
  vlib_buffer_t *b;
  u32 thread_index = vlib_get_thread_index ();

  for (i = 0; i < vec_len (f->buffers); i++)
    {
      b = vlib_get_buffer (vm, f->buffers[i]);
      dpdk_rte_pktmbuf_free (vm, thread_index, b, 1);
    }

  vec_free (f->buffers);

  /* Poison the free list so stale references are caught. */
  memset (f, 0xab, sizeof (f[0]));
}
/* Delete buffer free list. */
static void
dpdk_buffer_delete_free_list (vlib_main_t * vm,
			      vlib_buffer_free_list_index_t free_list_index)
{
  vlib_buffer_free_list_t *f;
  int i;

  ASSERT (vlib_get_thread_index () == 0);

  f = vlib_buffer_get_free_list (vm, free_list_index);

  del_free_list (vm, f);

  pool_put (vm->buffer_free_list_pool, f);

  /* Delete the per-worker copies of this free list as well. */
  for (i = 1; i < vec_len (vlib_mains); i++)
    {
      vlib_main_t *wvm = vlib_mains[i];
      f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);
      del_free_list (wvm, f);
      pool_put (wvm->buffer_free_list_pool, f);
    }
}
#endif
/* Make sure free list has at least given number of free buffers. */
uword
CLIB_MULTIARCH_FN (dpdk_buffer_fill_free_list) (vlib_main_t * vm,
						vlib_buffer_free_list_t * fl,
						uword min_free_buffers)
{
  dpdk_main_t *dm = &dpdk_main;
  dpdk_buffer_main_t *dbm = &dpdk_buffer_main;
  struct rte_mbuf **mb;
  uword n_left, first;
  word n_alloc;
  unsigned socket_id = rte_socket_id ();
  u32 thread_index = vlib_get_thread_index ();
  dpdk_buffer_per_thread_data *d = vec_elt_at_index (dbm->ptd, thread_index);
  struct rte_mempool *rmp = dm->pktmbuf_pools[socket_id];
  dpdk_mempool_private_t *privp = rte_mempool_get_priv (rmp);
  vlib_buffer_t bt;
  u32 *bi;
  /* Too early? */
  if (PREDICT_FALSE (rmp == 0))
    return 0;

  /* Already have enough free buffers on free list? */
  n_alloc = min_free_buffers - vec_len (fl->buffers);
  if (n_alloc <= 0)
    return min_free_buffers;
  /* Always allocate round number of buffers. */
  n_alloc = round_pow2 (n_alloc, CLIB_CACHE_LINE_BYTES / sizeof (u32));

  /* Always allocate new buffers in reasonably large sized chunks. */
  n_alloc = clib_max (n_alloc, fl->min_n_buffers_each_alloc);
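/*
 * Worked example (assuming 64-byte cache lines): the rounding step above
 * rounds n_alloc up to a multiple of CLIB_CACHE_LINE_BYTES / sizeof (u32)
 * = 16 indices, so a request for 100 buffers becomes 112; clib_max () then
 * raises that to fl->min_n_buffers_each_alloc if the chunk is still small.
 */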
  vec_validate_aligned (d->mbuf_alloc_list, n_alloc - 1,
			CLIB_CACHE_LINE_BYTES);

  /* Grab all mbufs in one mempool operation; on failure allocate nothing. */
  if (rte_mempool_get_bulk (rmp, (void *) d->mbuf_alloc_list, n_alloc) < 0)
    return 0;
  /* Build a template buffer header, copied into every new buffer below. */
  memset (&bt, 0, sizeof (vlib_buffer_t));
  vlib_buffer_init_for_free_list (&bt, fl);
  bt.buffer_pool_index = privp->buffer_pool_index;

  _vec_len (d->mbuf_alloc_list) = n_alloc;

  first = vec_len (fl->buffers);
  vec_resize_aligned (fl->buffers, n_alloc, CLIB_CACHE_LINE_BYTES);

  n_left = n_alloc;
  mb = d->mbuf_alloc_list;
  bi = fl->buffers + first;
  ASSERT (n_left % 8 == 0);

  while (n_left >= 8)
    {
      /* Prefetch buffer headers two iterations (16 buffers) ahead; skip
         when fewer than 24 remain so mb[16..23] stays in bounds. */
      if (PREDICT_FALSE (n_left < 24))
	goto no_prefetch;

      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[16]), STORE);
      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[17]), STORE);
      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[18]), STORE);
      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[19]), STORE);
      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[20]), STORE);
      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[21]), STORE);
      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[22]), STORE);
      vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[23]), STORE);
    no_prefetch:
      vlib_get_buffer_indices_with_offset (vm, (void **) mb, bi, 8,
					   sizeof (struct rte_mbuf));

      /* Copy the 64-byte template into the 8 new buffer headers. */
      clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[0]),
			vlib_buffer_from_rte_mbuf (mb[1]),
			vlib_buffer_from_rte_mbuf (mb[2]),
			vlib_buffer_from_rte_mbuf (mb[3]), &bt);
      clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[4]),
			vlib_buffer_from_rte_mbuf (mb[5]),
			vlib_buffer_from_rte_mbuf (mb[6]),
			vlib_buffer_from_rte_mbuf (mb[7]), &bt);

      n_left -= 8;
      mb += 8;
      bi += 8;
    }
  if (fl->buffer_init_function)
    fl->buffer_init_function (vm, fl, fl->buffers + first, n_alloc);

  fl->n_alloc += n_alloc;

  return n_alloc;
}
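/*
 * Note: this function is not called directly.  It is installed below via
 * VLIB_BUFFER_REGISTER_CALLBACKS as vlib_buffer_fill_free_list_cb, so
 * generic vlib buffer allocation (e.g. vlib_buffer_alloc ()) lands here
 * whenever a free list runs short.
 */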
static_always_inline void
dpdk_prefetch_buffer (vlib_buffer_t * b)
{
  struct rte_mbuf *mb;
  mb = rte_mbuf_from_vlib_buffer (b);
  CLIB_PREFETCH (mb, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
  CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, LOAD);
}
static_always_inline void
recycle_or_free (vlib_main_t * vm, vlib_buffer_main_t * bm, u32 bi,
		 vlib_buffer_t * b)
{
  u32 thread_index = vlib_get_thread_index ();

  dpdk_rte_pktmbuf_free (vm, thread_index, b, 1);
}
static_always_inline void
vlib_buffer_free_inline (vlib_main_t * vm,
			 u32 * buffers, u32 n_buffers, u32 follow_buffer_next)
{
  vlib_buffer_main_t *bm = &buffer_main;
  vlib_buffer_t *bufp[n_buffers], **b = bufp;
  u32 thread_index = vlib_get_thread_index ();
  u32 i, n_left, *bi;
  u32 simple_mask = (VLIB_BUFFER_NON_DEFAULT_FREELIST |
		     VLIB_BUFFER_NEXT_PRESENT);
  u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
	     u32 follow_buffer_next);

  cb = bm->buffer_free_callback;

  /* Give the registered callback (if any) a chance to filter the vector. */
  if (PREDICT_FALSE (cb != 0))
    n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next);

  if (!n_buffers)
    return;

  n_left = n_buffers;
  bi = buffers;
  vlib_get_buffers (vm, bi, b, n_buffers);
  while (n_left >= 4)
    {
      u32 or_flags;

      /* Prefetch the mbuf and buffer headers of a later quad so they are
         warm when their turn comes. */
      if (n_left >= 12)
	{
	  vlib_buffer_t **p = b + 8;
	  dpdk_prefetch_buffer (p[0]);
	  dpdk_prefetch_buffer (p[1]);
	  dpdk_prefetch_buffer (p[2]);
	  dpdk_prefetch_buffer (p[3]);
	}

      for (i = 0; i < 4; i++)
	VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[i]);

      or_flags = b[0]->flags | b[1]->flags | b[2]->flags | b[3]->flags;

      /* Any buffer on a non-default free list or with a chained segment
         takes the slow path; plain buffers are freed directly and the
         maybe_next = 0 shortcut is safe because no chain flag is set. */
      if (or_flags & simple_mask)
	{
	  recycle_or_free (vm, bm, bi[0], b[0]);
	  recycle_or_free (vm, bm, bi[1], b[1]);
	  recycle_or_free (vm, bm, bi[2], b[2]);
	  recycle_or_free (vm, bm, bi[3], b[3]);
	}
      else
	{
	  dpdk_rte_pktmbuf_free (vm, thread_index, b[0], 0);
	  dpdk_rte_pktmbuf_free (vm, thread_index, b[1], 0);
	  dpdk_rte_pktmbuf_free (vm, thread_index, b[2], 0);
	  dpdk_rte_pktmbuf_free (vm, thread_index, b[3], 0);
	}

      bi += 4;
      b += 4;
      n_left -= 4;
    }

  while (n_left)
    {
      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
      recycle_or_free (vm, bm, bi[0], b[0]);
      bi += 1;
      b += 1;
      n_left -= 1;
    }
}
void
CLIB_MULTIARCH_FN (dpdk_buffer_free) (vlib_main_t * vm, u32 * buffers,
				      u32 n_buffers)
{
  vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */
			   1);
}
void
CLIB_MULTIARCH_FN (dpdk_buffer_free_no_next) (vlib_main_t * vm, u32 * buffers,
					      u32 n_buffers)
{
  vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */
			   0);
}
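/*
 * Usage sketch (illustrative only): these two wrappers are what generic
 * vlib code reaches through the callback table registered below, e.g.
 *
 *   u32 bi[VLIB_FRAME_SIZE];
 *   ...
 *   vlib_buffer_free (vm, bi, n);           (next_buffer chains followed)
 *   vlib_buffer_free_no_next (vm, bi, n);   (chains left alone)
 */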
#ifndef CLIB_MARCH_VARIANT
static void
dpdk_packet_template_init (vlib_main_t * vm,
			   void *vt,
			   void *packet_data,
			   uword n_packet_data_bytes,
			   uword min_n_buffers_each_alloc, u8 * name)
{
  vlib_packet_template_t *t = (vlib_packet_template_t *) vt;

  vlib_worker_thread_barrier_sync (vm);
  memset (t, 0, sizeof (t[0]));

  vec_add (t->packet_data, packet_data, n_packet_data_bytes);

  vlib_worker_thread_barrier_release (vm);
}
static clib_error_t *
scan_vfio_fd (void *arg, u8 * path_name, u8 * file_name)
{
  dpdk_buffer_main_t *dbm = &dpdk_buffer_main;
  linux_vfio_main_t *lvm = &vfio_main;
  const char fn[] = "/dev/vfio/vfio";
  char buff[sizeof (fn)] = { 0 };
  int fd;
  u8 *path = format (0, "%v%c", path_name, 0);
  /* A link to the VFIO device node resolves to exactly "/dev/vfio/vfio";
     any other link length cannot match. */
  if (readlink ((char *) path, buff, sizeof (fn)) + 1 != sizeof (fn))
    goto done;

  if (strncmp (fn, buff, sizeof (fn)))
    goto done;

  fd = atoi ((char *) file_name);
  if (fd != lvm->container_fd)
    dbm->vfio_container_fd = fd;

done:
  vec_free (path);
  return 0;
}
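/*
 * Illustration (hypothetical numbers): if the process has a VFIO container
 * open as fd 17, /proc/self/fd/17 is a symlink to /dev/vfio/vfio, so the
 * directory walk in dpdk_pool_create() below records 17 in
 * vfio_container_fd, unless 17 is the container linux_vfio_main_t already
 * tracks.
 */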
clib_error_t *
dpdk_pool_create (vlib_main_t * vm, u8 * pool_name, u32 elt_size,
		  u32 num_elts, u32 pool_priv_size, u16 cache_size, u8 numa,
		  struct rte_mempool ** _mp,
		  vlib_physmem_region_index_t * pri)
{
  dpdk_buffer_main_t *dbm = &dpdk_buffer_main;
  struct rte_mempool *mp;
  vlib_physmem_region_t *pr;
  dpdk_mempool_private_t priv;
  clib_error_t *error = 0;
  size_t min_chunk_size, align;
  u32 size;
  int ret;
  uword i;
  mp = rte_mempool_create_empty ((char *) pool_name, num_elts, elt_size,
				 512, pool_priv_size, numa, 0);
  if (!mp)
    return clib_error_return (0, "failed to create %s", pool_name);

  rte_mempool_set_ops_byname (mp, RTE_MBUF_DEFAULT_MEMPOOL_OPS, NULL);

  /* 21 = log2 of the 2MB huge page size used for sizing. */
  size = rte_mempool_op_calc_mem_size_default (mp, num_elts, 21,
					       &min_chunk_size, &align);
  error = vlib_physmem_region_alloc (vm, (char *) pool_name, size, numa,
				     VLIB_PHYSMEM_F_HUGETLB |
				     VLIB_PHYSMEM_F_SHARED, pri);
  if (error)
    {
      rte_mempool_free (mp);
      return error;
    }

  pr = vlib_physmem_get_region (vm, pri[0]);
  /* Call the mempool priv initializer */
  priv.mbp_priv.mbuf_data_room_size = VLIB_BUFFER_PRE_DATA_SIZE +
    VLIB_BUFFER_DATA_SIZE;
  priv.mbp_priv.mbuf_priv_size = VLIB_BUFFER_HDR_SIZE;
  rte_pktmbuf_pool_init (mp, &priv);
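/*
 * rte_pktmbuf_pool_init() records these sizes in the pool private area;
 * rte_pktmbuf_init(), run over every element in dpdk_buffer_pool_create()
 * below, reads them back to set each mbuf's buf_len and data_off.  For
 * example, with the common 128 bytes of headroom and a 2048-byte data
 * size, every mbuf advertises a 2176-byte data room.
 */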
  for (i = 0; i < pr->n_pages; i++)
    {
      size_t page_size = 1ull << pr->log2_page_size;
      ret = rte_mempool_populate_iova (mp, ((char *) pr->mem) + i * page_size,
				       pr->page_table[i], page_size, 0, 0);
      if (ret < 0)
	{
	  rte_mempool_free (mp);
	  return clib_error_return (0, "failed to populate %s", pool_name);
	}
    }

  _mp[0] = mp;
  /* DPDK currently doesn't provide API to map DMA memory for empty mempool
     so we are using this hack, will be nice to have at least API to get
     VFIO container FD */
  if (dbm->vfio_container_fd == -1)
    foreach_directory_file ("/proc/self/fd", scan_vfio_fd, 0, 0);
  if (dbm->vfio_container_fd != -1)
    {
      struct vfio_iommu_type1_dma_map dm = { 0 };
      int rv = 0;

      dm.argsz = sizeof (struct vfio_iommu_type1_dma_map);
      dm.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

      /* Map each physmem page into the IOMMU so devices can DMA to it. */
      vec_foreach_index (i, pr->page_table)
	{
	  dm.vaddr = pointer_to_uword (pr->mem) + ((u64) i << pr->log2_page_size);
	  dm.size = 1ull << pr->log2_page_size;
	  dm.iova = pr->page_table[i];
	  if ((rv = ioctl (dbm->vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dm)))
	    break;
	}

      if (rv != 0 && errno != EINVAL)
	clib_unix_warning ("ioctl(VFIO_IOMMU_MAP_DMA) pool '%s'", pool_name);
    }

  return 0;
}
clib_error_t *
dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
			 unsigned socket_id)
{
  dpdk_main_t *dm = &dpdk_main;
  struct rte_mempool *rmp;
  vlib_physmem_region_index_t pri;
  clib_error_t *error = 0;
  u8 *pool_name;
  u32 elt_size, i;

  vec_validate_aligned (dm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES);

  /* pool already exists, nothing to do */
  if (dm->pktmbuf_pools[socket_id])
    return 0;
  pool_name = format (0, "dpdk_mbuf_pool_socket%u%c", socket_id, 0);

  /* Element = mbuf header, vlib buffer header (in the mbuf private area),
     headroom and data room; see the layout sketch near the top of the
     file. */
  elt_size = sizeof (struct rte_mbuf) +
    VLIB_BUFFER_HDR_SIZE /* priv size */ +
    VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE;	/* data room size */
  error =
    dpdk_pool_create (vm, pool_name, elt_size, num_mbufs,
		      sizeof (dpdk_mempool_private_t), 512, socket_id,
		      &rmp, &pri);

  vec_free (pool_name);

  if (!error)
    {
      /* call the object initializers */
      rte_mempool_obj_iter (rmp, rte_pktmbuf_init, 0);

      dpdk_mempool_private_t *privp = rte_mempool_get_priv (rmp);
      privp->buffer_pool_index = vlib_buffer_pool_create (vm, pri, 0);

      dm->pktmbuf_pools[socket_id] = rmp;

      return 0;
    }
  clib_error_report (error);

  /* no usable pool for this socket, try to use pool from another one */
  for (i = 0; i < vec_len (dm->pktmbuf_pools); i++)
    {
      if (dm->pktmbuf_pools[i])
	{
	  clib_warning ("WARNING: Failed to allocate mempool for CPU socket "
			"%u. Threads running on socket %u will use socket %u "
			"mempool.", socket_id, socket_id, i);
	  dm->pktmbuf_pools[socket_id] = dm->pktmbuf_pools[i];
	  return 0;
	}
    }

  return clib_error_return (0, "failed to allocate mempool on socket %u",
			    socket_id);
}
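/*
 * Typical call site (outside this file, shown for illustration only): the
 * DPDK plugin init path creates one pool per NUMA node in use, roughly
 *
 *   clib_error_t *err = dpdk_buffer_pool_create (vm, num_mbufs,
 *                                                rte_socket_id ());
 *
 * where num_mbufs is hypothetical configuration state.
 */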
u32 *vlib_buffer_state_validation_lock;
uword *vlib_buffer_state_validation_hash;
void *vlib_buffer_state_heap;
static clib_error_t *
buffer_state_validation_init (vlib_main_t * vm)
{
  void *oldheap;

  vlib_buffer_state_heap =
    mheap_alloc_with_lock (0, 10 << 20, 0 /* locked */ );
  oldheap = clib_mem_set_heap (vlib_buffer_state_heap);

  vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword));
  vec_validate_aligned (vlib_buffer_state_validation_lock, 0,
			CLIB_CACHE_LINE_BYTES);
  clib_mem_set_heap (oldheap);
  return 0;
}

VLIB_INIT_FUNCTION (buffer_state_validation_init);
struct dpdk_validate_buf_result
{
  u32 invalid;
  u32 uninitialized;
};

#define DPDK_TRAJECTORY_POISON 31
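/*
 * Trajectory debugging: poisoning writes the sentinel above into
 * pre_data[0] of every mbuf in every pool, and buffer producers are
 * expected to overwrite it.  A later validation pass counts survivors;
 * a usage sketch (illustrative only):
 *
 *   u32 uninitialized = 0;
 *   dpdk_buffer_poison_trajectory_all ();
 *   ... run traffic ...
 *   int invalid = dpdk_buffer_validate_trajectory_all (&uninitialized);
 */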
static void
dpdk_buffer_validate_trajectory (struct rte_mempool *mp, void *opaque,
				 void *obj, unsigned obj_idx)
{
  vlib_buffer_t *b;
  struct dpdk_validate_buf_result *counter = opaque;
  b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj);
  if (b->pre_data[0] != 0)
    {
      if (b->pre_data[0] == DPDK_TRAJECTORY_POISON)
	counter->uninitialized++;
      else
	counter->invalid++;
    }
}
int
dpdk_buffer_validate_trajectory_all (u32 * uninitialized)
{
  dpdk_main_t *dm = &dpdk_main;
  struct dpdk_validate_buf_result counter = { 0 };
  int i;

  for (i = 0; i < vec_len (dm->pktmbuf_pools); i++)
    rte_mempool_obj_iter (dm->pktmbuf_pools[i],
			  dpdk_buffer_validate_trajectory, &counter);

  *uninitialized = counter.uninitialized;
  return counter.invalid;
}
static void
dpdk_buffer_poison_trajectory (struct rte_mempool *mp, void *opaque,
			       void *obj, unsigned obj_idx)
{
  vlib_buffer_t *b;
  b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj);
  b->pre_data[0] = DPDK_TRAJECTORY_POISON;
}
void
dpdk_buffer_poison_trajectory_all (void)
{
  dpdk_main_t *dm = &dpdk_main;
  int i;

  for (i = 0; i < vec_len (dm->pktmbuf_pools); i++)
    rte_mempool_obj_iter (dm->pktmbuf_pools[i], dpdk_buffer_poison_trajectory,
			  0);
}
static clib_error_t *
dpdk_buffer_init (vlib_main_t * vm)
{
  dpdk_buffer_main_t *dbm = &dpdk_buffer_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();

  vec_validate_aligned (dbm->ptd, tm->n_vlib_mains - 1,
			CLIB_CACHE_LINE_BYTES);

  /* -1 means "not discovered yet"; see scan_vfio_fd () above. */
  dbm->vfio_container_fd = -1;

  return 0;
}

VLIB_INIT_FUNCTION (dpdk_buffer_init);
VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = {
  .vlib_buffer_fill_free_list_cb = &dpdk_buffer_fill_free_list,
  .vlib_buffer_free_cb = &dpdk_buffer_free,
  .vlib_buffer_free_no_next_cb = &dpdk_buffer_free_no_next,
  .vlib_packet_template_init_cb = &dpdk_packet_template_init,
  .vlib_buffer_delete_free_list_cb = &dpdk_buffer_delete_free_list,
};
vlib_buffer_fill_free_list_cb_t __clib_weak dpdk_buffer_fill_free_list_avx512;
vlib_buffer_fill_free_list_cb_t __clib_weak dpdk_buffer_fill_free_list_avx2;
vlib_buffer_free_cb_t __clib_weak dpdk_buffer_free_avx512;
vlib_buffer_free_cb_t __clib_weak dpdk_buffer_free_avx2;
vlib_buffer_free_no_next_cb_t __clib_weak dpdk_buffer_free_no_next_avx512;
vlib_buffer_free_no_next_cb_t __clib_weak dpdk_buffer_free_no_next_avx2;
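/*
 * The __clib_weak declarations above resolve to NULL when the per-ISA
 * variant objects are not linked in, so the constructor below can simply
 * test the pointers at startup and leave the generic callbacks in place
 * when no AVX2/AVX-512 build is available.
 */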
static void __clib_constructor
dpdk_input_multiarch_select (void)
{
  vlib_buffer_callbacks_t *cb = &__dpdk_buffer_callbacks;
  if (dpdk_buffer_fill_free_list_avx512 && clib_cpu_supports_avx512f ())
    {
      cb->vlib_buffer_fill_free_list_cb = dpdk_buffer_fill_free_list_avx512;
      cb->vlib_buffer_free_cb = dpdk_buffer_free_avx512;
      cb->vlib_buffer_free_no_next_cb = dpdk_buffer_free_no_next_avx512;
    }
  else if (dpdk_buffer_fill_free_list_avx2 && clib_cpu_supports_avx2 ())
    {
      cb->vlib_buffer_fill_free_list_cb = dpdk_buffer_fill_free_list_avx2;
      cb->vlib_buffer_free_cb = dpdk_buffer_free_avx2;
      cb->vlib_buffer_free_no_next_cb = dpdk_buffer_free_no_next_avx2;
    }
}
#endif
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */