2 *------------------------------------------------------------------
3 * Copyright (c) 2018 Cisco and/or its affiliates.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *------------------------------------------------------------------
18 #include <vlib/vlib.h>
19 #include <vlib/unix/unix.h>
20 #include <vlib/pci/pci.h>
21 #include <vnet/ethernet/ethernet.h>
22 #include <vnet/devices/devices.h>
24 #include <rdma/rdma.h>
/*
 * X-macro list of the error counters of the rdma input path. Each entry has
 * the form _(SYMBOL, "description"); a user defines _() before expanding the
 * list, so several parallel tables can be derived from this single list.
 */
26 #define foreach_rdma_input_error \
27 _(BUFFER_ALLOC, "buffer alloc error")
/*
 * First expansion: every entry becomes an RDMA_INPUT_ERROR_<SYMBOL> constant
 * followed by a comma.
 * NOTE(review): the enclosing declaration is not visible here; presumably an
 * enum terminated by RDMA_INPUT_N_ERROR (used by the node registration) --
 * confirm.
 */
31 #define _(f,s) RDMA_INPUT_ERROR_##f,
32 foreach_rdma_input_error
/*
 * Error strings of the rdma input node, generated from
 * foreach_rdma_input_error and referenced by the node registration through
 * its .error_strings field.
 * NOTE(review): the definition of _() used for this expansion is not visible
 * here; presumably it selects the description string of each entry --
 * confirm.
 */
37 static __clib_unused char *rdma_input_error_strings[] = {
39 foreach_rdma_input_error
/*
 * Prepare one receive work request (w) together with its scatter/gather
 * entry (s).
 *
 *   w          work request to set up
 *   s          scatter/gather entry to set up
 *   va         per-buffer value taken by the refill loop from its va[] array
 *   data_size  length recorded in the scatter/gather entry
 *   lkey       value handed through by the refill loop
 *
 * NOTE(review): only the length assignment below is visible; how w, va and
 * lkey are consumed must be confirmed against the complete definition.
 */
44 static_always_inline void
45 ibv_set_recv_wr_and_sge (struct ibv_recv_wr *w, struct ibv_sge *s, u64 va,
46 u32 data_size, u32 lkey)
/* the scatter/gather entry describes data_size bytes */
49 s[0].length = data_size;
/*
 * Replenish the receive ring of an rx queue with newly allocated buffers and
 * post them to the work queue.
 *
 * Visible steps: size the batch against the free ring space, allocate buffer
 * indices directly into rxq->bufs, build one work request / scatter-gather
 * pair per buffer (8 buffers per group), then hand the resulting list to
 * ibv_post_wq_recv().
 *
 * NOTE(review): several guards and the loop header are not visible in this
 * block; the hedged notes below mark the places to confirm.
 */
56 static_always_inline void
57 rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
61 struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr;
62 struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge;
/*
 * rxq->size is asserted to be a power of two in rdma_device_input_inline, so
 * size - 1 can be used as an index mask; slot is the ring position of tail.
 */
63 u32 mask = rxq->size - 1;
64 u32 slot = rxq->tail & mask;
65 u32 *bufs = rxq->bufs + slot;
/* length later recorded for every posted buffer */
66 u32 data_size = vlib_buffer_get_default_data_size (vm);
69 /* do not enqueue more packets than ring space */
/* tail - head entries are currently occupied; cap the batch at one frame */
70 n_alloc = clib_min (VLIB_FRAME_SIZE, rxq->size - (rxq->tail - rxq->head));
72 /* do not bother to allocate if too small */
/* NOTE(review): the threshold test for this early exit is not visible */
76 /* avoid wrap-around logic in core loop */
/* only fill up to the physical end of the ring in this call */
77 n_alloc = clib_min (n_alloc, rxq->size - slot);
/* buffers are processed in groups of 8 below, so round down to a multiple */
79 n_alloc &= ~7; /* round to 8 */
/* allocate buffer indices straight into the ring, starting at slot */
81 n = vlib_buffer_alloc_to_ring_from_pool (vm, rxq->bufs, slot, rxq->size,
/* fewer buffers than requested were obtained */
84 if (PREDICT_FALSE (n != n_alloc))
/*
 * hand the n obtained buffers back, starting at slot
 * NOTE(review): the condition guarding this complete release is not visible
 */
90 vlib_buffer_free_from_ring (vm, rxq->bufs, slot, rxq->size, n);
94 /* partial allocation, round and return rest */
/* release starts at the first slot after the n buffers, wrapped by mask */
98 vlib_buffer_free_from_ring (vm, rxq->bufs, (slot + n) & mask,
/* with at least 16 entries left, prefetch the entries 16 ahead for writing */
107 if (PREDICT_TRUE (n >= 16))
109 clib_prefetch_store (s + 16);
110 clib_prefetch_store (w + 16);
/*
 * translate 8 buffer indices into va[]
 * NOTE(review): presumably each va[] entry is the buffer header address plus
 * sizeof (vlib_buffer_t) -- confirm against vlib_get_buffers_with_offset
 */
113 vlib_get_buffers_with_offset (vm, bufs, (void **) va, 8,
114 sizeof (vlib_buffer_t));
/* one work request and one scatter/gather entry per buffer, in lockstep */
116 ibv_set_recv_wr_and_sge (w++, s++, va[0], data_size, lkey);
117 ibv_set_recv_wr_and_sge (w++, s++, va[1], data_size, lkey);
118 ibv_set_recv_wr_and_sge (w++, s++, va[2], data_size, lkey);
119 ibv_set_recv_wr_and_sge (w++, s++, va[3], data_size, lkey);
120 ibv_set_recv_wr_and_sge (w++, s++, va[4], data_size, lkey);
121 ibv_set_recv_wr_and_sge (w++, s++, va[5], data_size, lkey);
122 ibv_set_recv_wr_and_sge (w++, s++, va[6], data_size, lkey);
123 ibv_set_recv_wr_and_sge (w++, s++, va[7], data_size, lkey);
129 w[-1].next = 0; /* fix next pointer in WR linked-list last item */
/*
 * post the list that starts at wr[0]; w is passed by address
 * NOTE(review): presumably w receives the first work request that could not
 * be posted -- confirm against the verbs API and check how n is adjusted
 */
132 if (ibv_post_wq_recv (rxq->wq, wr, &w) != 0)
/*
 * release buffers starting at slot + n
 * NOTE(review): the start index is not masked here, unlike the partial
 * allocation path above -- confirm slot + n cannot exceed the ring size
 */
135 vlib_buffer_free_from_ring (vm, rxq->bufs, slot + n, rxq->size,
/*
 * Attach rdma input trace records to received buffers while the node still
 * has a non-zero trace count.
 *
 *   rd      device whose hw_if_index is stored in each record
 *   n_left  number of buffer indices available in bi
 *   bi      buffer indices of the received packets
 *
 * next_index is recorded in every trace entry.
 */
142 static_always_inline void
143 rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
144 const rdma_device_t * rd, u32 n_left, const u32 * bi,
/*
 * common case: nothing to trace
 * NOTE(review): the body of this guard is not visible; presumably it returns
 */
149 if (PREDICT_TRUE (0 == (n_trace = vlib_get_trace_count (vm, node))))
/* continue while both trace budget and packets remain */
153 while (n_trace && n_left)
156 rdma_input_trace_t *tr;
157 b = vlib_get_buffer (vm, bi[0]);
/* mark the buffer as traced without following the buffer chain */
158 vlib_trace_buffer (vm, node, next_index, b,
159 /* follow_chain */ 0);
/* append a record and fill in next node and hardware interface */
160 tr = vlib_add_trace (vm, node, b, sizeof (*tr));
161 tr->next_index = next_index;
162 tr->hw_if_index = rd->hw_if_index;
/*
 * store n_trace back as the trace count of the node
 * NOTE(review): the statements advancing n_trace, n_left and bi are not
 * visible in this block
 */
170 vlib_set_trace_count (vm, node, n_trace);
/*
 * Annotate the frame queued towards ethernet-input with the identity of the
 * receiving interface: a flag stating that the whole frame shares a single
 * software interface, plus the sw_if_index and hw_if_index of rd as frame
 * scalar arguments.
 */
173 static_always_inline void
174 rdma_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node,
175 const rdma_device_t * rd, u32 next_index)
177 vlib_next_frame_t *nf;
179 ethernet_input_frame_t *ef;
/*
 * the annotation only concerns frames going to ethernet-input
 * NOTE(review): the body of this guard is not visible; presumably it returns
 */
181 if (PREDICT_FALSE (VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT != next_index))
/* look up the frame currently associated with the ethernet-input next */
185 vlib_node_runtime_get_next_frame (vm, node,
186 VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT);
187 f = vlib_get_frame (vm, nf->frame);
/* plain assignment: any flags already set on the frame are replaced */
188 f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
189 /* FIXME: f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK; */
/* scalar arguments carry the interface indices of the device */
191 ef = vlib_frame_scalar_args (f);
192 ef->sw_if_index = rd->sw_if_index;
193 ef->hw_if_index = rd->hw_if_index;
/*
 * Turn n_left_from completed receives into ready-to-forward buffers.
 *
 *   next         destination for the buffer indices (copied from bi)
 *   bi           buffer indices of the completed receives
 *   wc           work completions; byte_len supplies each packet length
 *   n_left_from  number of entries to process
 *   bt           buffer template applied to every buffer
 *
 * Returns the total number of received bytes, i.e. the sum of the four
 * partial accumulators.
 *
 * NOTE(review): the statements advancing next, bi, b, wc and n_left_from are
 * not visible in this block.
 */
196 static_always_inline u32
197 rdma_device_input_bufs (vlib_main_t * vm, const rdma_device_t * rd,
198 u32 * next, u32 * bi, struct ibv_wc * wc,
199 u32 n_left_from, vlib_buffer_t * bt)
201 vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
/* one byte accumulator per position of the 4-wide loop */
202 u32 n_rx_bytes[4] = { 0 };
/* resolve all buffer indices to pointers up front */
204 vlib_get_buffers (vm, bi, bufs, n_left_from);
/* the template must describe the pool the buffers come from */
205 ASSERT (bt->buffer_pool_index == bufs[0]->buffer_pool_index);
/* main loop: four packets per iteration */
207 while (n_left_from >= 4)
/* prefetch the next group of four only when a full group follows */
209 if (PREDICT_TRUE (n_left_from >= 8))
211 CLIB_PREFETCH (&wc[4 + 0], CLIB_CACHE_LINE_BYTES, LOAD);
212 CLIB_PREFETCH (&wc[4 + 1], CLIB_CACHE_LINE_BYTES, LOAD);
213 CLIB_PREFETCH (&wc[4 + 2], CLIB_CACHE_LINE_BYTES, LOAD);
214 CLIB_PREFETCH (&wc[4 + 3], CLIB_CACHE_LINE_BYTES, LOAD);
215 vlib_prefetch_buffer_header (b[4 + 0], STORE);
216 vlib_prefetch_buffer_header (b[4 + 1], STORE);
217 vlib_prefetch_buffer_header (b[4 + 2], STORE);
218 vlib_prefetch_buffer_header (b[4 + 3], STORE);
/* hand the four buffer indices to the caller-provided array */
221 vlib_buffer_copy_indices (next, bi, 4);
/* initialise each buffer header from the template */
223 vlib_buffer_copy_template (b[0], bt);
224 vlib_buffer_copy_template (b[1], bt);
225 vlib_buffer_copy_template (b[2], bt);
226 vlib_buffer_copy_template (b[3], bt);
/* packet length comes from the completion entry */
228 b[0]->current_length = wc[0].byte_len;
229 b[1]->current_length = wc[1].byte_len;
230 b[2]->current_length = wc[2].byte_len;
231 b[3]->current_length = wc[3].byte_len;
233 n_rx_bytes[0] += wc[0].byte_len;
234 n_rx_bytes[1] += wc[1].byte_len;
235 n_rx_bytes[2] += wc[2].byte_len;
236 n_rx_bytes[3] += wc[3].byte_len;
/* remainder loop: same steps, one packet at a time */
245 while (n_left_from >= 1)
247 vlib_buffer_copy_indices (next, bi, 1);
248 vlib_buffer_copy_template (b[0], bt);
249 b[0]->current_length = wc[0].byte_len;
250 n_rx_bytes[0] += wc[0].byte_len;
259 return n_rx_bytes[0] + n_rx_bytes[1] + n_rx_bytes[2] + n_rx_bytes[3];
/*
 * Receive path for one rx queue (qid) of device rd: poll the completion
 * queue, convert the completions into buffers placed in a new next frame,
 * update tracing and interface counters, and finally refill the ring.
 *
 * NOTE(review): the return statements are not visible in this block; the
 * node function adds the result to its n_rx total.
 */
262 static_always_inline uword
263 rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
264 vlib_frame_t * frame, rdma_device_t * rd, u16 qid)
266 rdma_main_t *rm = &rdma_main;
267 vnet_main_t *vnm = vnet_get_main ();
/* NOTE(review): the index into per_thread_data is not visible here */
268 rdma_per_thread_data_t *ptd = vec_elt_at_index (rm->per_thread_data,
270 rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
/* room for one frame worth of completions */
271 struct ibv_wc wc[VLIB_FRAME_SIZE];
273 u32 next_index, *to_next, n_left_to_next;
274 u32 n_rx_packets, n_rx_bytes;
/* ring invariants: a power-of-two size that holds at least one frame ... */
277 ASSERT (rxq->size >= VLIB_FRAME_SIZE && is_pow2 (rxq->size));
/* ... and never more entries in flight than the ring can hold */
278 ASSERT (rxq->tail - rxq->head <= rxq->size);
/* fetch at most one frame worth of completions */
280 n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);
281 ASSERT (n_rx_packets <= rxq->tail - rxq->head);
/*
 * NOTE(review): n_rx_packets is declared u32 above, so this test can only
 * match 0; a negative return of ibv_poll_cq would appear as a very large
 * count -- confirm and consider a signed temporary. The body of this guard
 * is not visible here.
 */
283 if (PREDICT_FALSE (n_rx_packets <= 0))
286 /* init buffer template */
/* start from the per-thread template, then specialise it for this device */
287 vlib_buffer_copy_template (&bt, &ptd->buffer_template);
288 vnet_buffer (&bt)->sw_if_index[VLIB_RX] = rd->sw_if_index;
289 bt.buffer_pool_index = rd->pool;
291 /* update buffer template for input feature arcs if any */
/* default next node of the interface, possibly overridden by features */
292 next_index = rd->per_interface_next_index;
293 if (PREDICT_FALSE (vnet_device_input_have_features (rd->sw_if_index)))
294 vnet_feature_start_device_input_x1 (rd->sw_if_index, &next_index, &bt);
/* a new frame is requested; all polled packets are expected to fit */
296 vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);
297 ASSERT (n_rx_packets <= n_left_to_next);
300 * avoid wrap-around logic in core loop
301 * we requested VLIB_FRAME_SIZE packets and rxq->size >= VLIB_FRAME_SIZE
302 * => we can process all packets in 2 iterations max
/* n_tail: packets located between head and the physical end of the ring */
304 slot = rxq->head & (rxq->size - 1);
305 n_tail = clib_min (n_rx_packets, rxq->size - slot);
/*
 * first chunk: from slot up to the end of the ring
 * NOTE(review): the variable receiving the returned byte count is not
 * visible; presumably n_rx_bytes
 */
307 rdma_device_input_bufs (vm, rd, &to_next[0], &rxq->bufs[slot], wc, n_tail,
/* second chunk, if any: the packets that wrapped to the start of the ring */
309 if (n_tail < n_rx_packets)
311 rdma_device_input_bufs (vm, rd, &to_next[n_tail], &rxq->bufs[0],
312 &wc[n_tail], n_rx_packets - n_tail, &bt);
/* add interface details to the frame when it goes to ethernet-input */
313 rdma_device_input_ethernet (vm, node, rd, next_index);
/* hand over the frame, reporting the slots left unused */
315 vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets);
/* the processed ring entries are consumed */
317 rxq->head += n_rx_packets;
319 rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next, next_index);
/*
 * account received packets and bytes
 * NOTE(review): the combined sw_if counter is indexed with rd->hw_if_index
 * rather than rd->sw_if_index -- confirm that this is intended
 */
321 vlib_increment_combined_counter
322 (vnm->interface_main.combined_sw_if_counters +
323 VNET_INTERFACE_COUNTER_RX, vm->thread_index,
324 rd->hw_if_index, n_rx_packets, n_rx_bytes);
/* replace the consumed ring entries with fresh buffers */
327 rdma_device_input_refill (vm, rd, rxq);
/*
 * Node function of rdma-input: visit every (device, queue) pair listed in
 * the runtime data of the node and run the receive path on the devices that
 * are administratively up.
 *
 * NOTE(review): the return statement is not visible here; presumably the
 * accumulated n_rx is returned.
 */
332 VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm,
333 vlib_node_runtime_t * node,
334 vlib_frame_t * frame)
337 rdma_main_t *rm = &rdma_main;
/* the runtime data of the node holds the device/queue assignments */
338 vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
339 vnet_device_and_queue_t *dq;
341 foreach_device_and_queue (dq, rt->devices_and_queues)
344 rd = vec_elt_at_index (rm->devices, dq->dev_instance);
/* skip devices without the admin-up flag */
345 if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ADMIN_UP))
346 n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id);
/*
 * Registration of the rdma-input graph node: an input node with trace
 * support, declared as a sibling of device-input and registered in the
 * disabled state.
 */
352 VLIB_REGISTER_NODE (rdma_input_node) = {
353 .name = "rdma-input",
354 .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED,
355 .sibling_of = "device-input",
/* formatter for the records written by rdma_device_input_trace */
356 .format_trace = format_rdma_input_trace,
357 .type = VLIB_NODE_TYPE_INPUT,
/* NOTE(review): the code enabling the node is not part of this block */
358 .state = VLIB_NODE_STATE_DISABLED,
/* error counters generated from foreach_rdma_input_error */
359 .n_errors = RDMA_INPUT_N_ERROR,
360 .error_strings = rdma_input_error_strings,
367 * fd.io coding-style-patch-verification: ON
370 * eval: (c-set-style "gnu")