2 *------------------------------------------------------------------
3 * Copyright (c) 2018 Cisco and/or its affiliates.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *------------------------------------------------------------------
18 #include <vlib/vlib.h>
19 #include <vlib/unix/unix.h>
20 #include <vlib/pci/pci.h>
21 #include <vnet/ethernet/ethernet.h>
22 #include <vnet/devices/devices.h>
24 #include <rdma/rdma.h>
/* X-macro listing the error counters of the RDMA input node.  Each entry is
   _(NAME, "text"): an identifier suffix paired with a human-readable string. */
26 #define foreach_rdma_input_error \
27 _(BUFFER_ALLOC, "buffer alloc error")
/* Expand every list entry into an RDMA_INPUT_ERROR_<NAME> enumerator.
   NOTE(review): the enclosing enum declaration is not visible in this
   excerpt -- confirm against the full file. */
31 #define _(f,s) RDMA_INPUT_ERROR_##f,
32 foreach_rdma_input_error
/* Error string table built from the foreach_rdma_input_error list; it is the
   array handed to the node registration as .error_strings.
   NOTE(review): the `_` definition in effect for this expansion is not
   visible in this excerpt. */
37 static __clib_unused char *rdma_input_error_strings[] = {
39 foreach_rdma_input_error
/* Refill an RX queue: allocate vlib buffers, describe each one with an
   ibv_sge / ibv_recv_wr pair, chain the work requests through their `next`
   pointers and post the chain with ibv_post_wq_recv().  Buffers that were
   allocated but not counted as posted are freed again.
   NOTE(review): loop headers, pointer advances and some WR field stores are
   not visible in this excerpt; comments below describe only the shown lines. */
43 static_always_inline void
44 rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
48 u32 buffers[VLIB_FRAME_SIZE], *bi = buffers;
49 vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
50 struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr;
51 struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge;
/* Queue already holds at least rxq->size entries (presumably an early
   return follows -- the action is not visible here). */
53 if (PREDICT_FALSE (rxq->n_enq >= rxq->size))
/* Request at most VLIB_FRAME_SIZE buffers, matching the size of the local
   arrays above, and no more than the free room in the queue. */
56 n_alloc = clib_min (VLIB_FRAME_SIZE, rxq->size - rxq->n_enq);
/* n_alloc is overwritten with the count returned by vlib_buffer_alloc();
   n starts at the same value. */
57 n_alloc = n = vlib_buffer_alloc (vm, buffers, n_alloc);
58 vlib_get_buffers (vm, buffers, bufs, n_alloc);
/* With 8 or more left, prefetch the next four sge/wr slots for writing. */
62 if (PREDICT_TRUE (n >= 8))
64 CLIB_PREFETCH (&s[4 + 0], 4 * sizeof (s[0]), STORE);
65 CLIB_PREFETCH (&w[4 + 0], 4 * sizeof (w[0]), STORE);
/* One scatter/gather entry per buffer: address from vlib_buffer_get_va(),
   length = default buffer data size, lkey taken from the device's rd->mr. */
68 s[0].addr = vlib_buffer_get_va (b[0]);
69 s[0].length = vlib_buffer_get_default_data_size (vm);
70 s[0].lkey = rd->mr->lkey;
72 s[1].addr = vlib_buffer_get_va (b[1]);
73 s[1].length = vlib_buffer_get_default_data_size (vm);
74 s[1].lkey = rd->mr->lkey;
76 s[2].addr = vlib_buffer_get_va (b[2]);
77 s[2].length = vlib_buffer_get_default_data_size (vm);
78 s[2].lkey = rd->mr->lkey;
80 s[3].addr = vlib_buffer_get_va (b[3]);
81 s[3].length = vlib_buffer_get_default_data_size (vm);
82 s[3].lkey = rd->mr->lkey;
/* Each work request points at the following slot of wr[]. */
85 w[0].next = &w[0] + 1;
90 w[1].next = &w[1] + 1;
95 w[2].next = &w[2] + 1;
100 w[3].next = &w[3] + 1;
101 w[3].sg_list = &s[3];
/* Same setup for a single buffer (presumably the remainder loop -- its
   header is not visible here). */
113 s[0].addr = vlib_buffer_get_va (b[0]);
114 s[0].length = vlib_buffer_get_default_data_size (vm);
115 s[0].lkey = rd->mr->lkey;
118 w[0].next = &w[0] + 1;
119 w[0].sg_list = &s[0];
/* NOTE(review): assuming w advances once per work request in lines not shown,
   w still equals wr when vlib_buffer_alloc() returned 0, and w[-1] would then
   index before wr[].  Confirm whether a zero-allocation guard exists. */
129 w[-1].next = 0; /* fix next pointer in WR linked-list last item */
/* &w is passed as the third argument (the bad-WR out-parameter per the
   libibverbs API -- verify against the man page). */
132 ibv_post_wq_recv (rxq->wq, wr, &w);
/* w == wr afterwards: count all n_alloc as posted; otherwise count w - wr. */
133 n = wr == w ? n_alloc : (uintptr_t) (w - wr);
/* Give back the buffers in [n, n_alloc) that are not counted as posted. */
135 if (PREDICT_FALSE (n != n_alloc))
136 vlib_buffer_free (vm, buffers + n, n_alloc - n);
/* Add packet-trace records for received buffers while the node still has a
   trace budget.
   vm, node: vlib main and this node's runtime.
   rd:       device; supplies per_interface_next_index and hw_if_index.
   n_left:   number of buffer indices available in bi.
   bi:       buffer indices to trace.
   NOTE(review): the loop's counter/pointer updates are not visible in this
   excerpt. */
141 static_always_inline void
142 rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
143 const rdma_device_t * rd, u32 n_left, const u32 * bi)
/* Expected case: trace count is zero (presumably returns immediately -- the
   action is not visible here). */
147 if (PREDICT_TRUE (0 == (n_trace = vlib_get_trace_count (vm, node))))
/* Continue while both trace budget and buffers remain. */
151 while (n_trace && n_left)
154 rdma_input_trace_t *tr;
155 b = vlib_get_buffer (vm, bi[0]);
156 vlib_trace_buffer (vm, node, rd->per_interface_next_index, b,
157 /* follow_chain */ 0);
/* The trace record stores the next index and the hardware interface index. */
158 tr = vlib_add_trace (vm, node, b, sizeof (*tr));
159 tr->next_index = rd->per_interface_next_index;
160 tr->hw_if_index = rd->hw_if_index;
/* Store the trace count back into the node. */
168 vlib_set_trace_count (vm, node, n_trace);
/* Fill in the ethernet-input frame metadata of the pending next frame: flag
   the frame as carrying a single sw_if_index and store the device's
   sw_if_index / hw_if_index in the frame's scalar arguments. */
171 static_always_inline void
172 rdma_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node,
173 const rdma_device_t * rd)
175 vlib_next_frame_t *nf;
177 ethernet_input_frame_t *ef;
/* Tests whether the device's next node is something other than
   ethernet-input.  NOTE(review): the head of this condition and the action
   taken are not visible in this excerpt -- presumably an early return. */
180 (VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT != rd->per_interface_next_index))
/* Look up the frame currently queued towards the device's next node. */
184 vlib_node_runtime_get_next_frame (vm, node, rd->per_interface_next_index);
185 f = vlib_get_frame (vm, nf->frame_index);
186 f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
187 /* FIXME: f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK; */
/* The scalar area of the frame carries the interface indices. */
189 ef = vlib_frame_scalar_args (f);
190 ef->sw_if_index = rd->sw_if_index;
191 ef->hw_if_index = rd->hw_if_index;
/* Unpack work completions: copy each wc[i].wr_id into to_next[] (the caller
   later resolves these values as buffer indices) and each wc[i].byte_len
   into bufsz[].  Returns the sum of all byte_len values seen.
   NOTE(review): the last parameter (bufsz, judging by its use below) and the
   loop advances are not visible in this excerpt. */
194 static_always_inline u32
195 rdma_device_input_load_wc (u32 n_left_from, struct ibv_wc * wc, u32 * to_next,
/* Four independent partial sums, one per lane of the 4-wide loop. */
198 u32 n_rx_bytes[4] = { 0 };
200 while (n_left_from >= 4)
/* With 8 or more left, prefetch the next four completions (one cache line
   each) plus the matching output slots. */
202 if (PREDICT_TRUE (n_left_from >= 8))
204 CLIB_PREFETCH (&wc[4 + 0], CLIB_CACHE_LINE_BYTES, LOAD);
205 CLIB_PREFETCH (&wc[4 + 1], CLIB_CACHE_LINE_BYTES, LOAD);
206 CLIB_PREFETCH (&wc[4 + 2], CLIB_CACHE_LINE_BYTES, LOAD);
207 CLIB_PREFETCH (&wc[4 + 3], CLIB_CACHE_LINE_BYTES, LOAD);
208 CLIB_PREFETCH (&bufsz[4 + 0], 4 * sizeof (bufsz[0]), STORE);
209 CLIB_PREFETCH (&to_next[4 + 0], 4 * sizeof (to_next[0]), STORE);
212 to_next[0] = wc[0].wr_id;
213 to_next[1] = wc[1].wr_id;
214 to_next[2] = wc[2].wr_id;
215 to_next[3] = wc[3].wr_id;
217 bufsz[0] = wc[0].byte_len;
218 bufsz[1] = wc[1].byte_len;
219 bufsz[2] = wc[2].byte_len;
220 bufsz[3] = wc[3].byte_len;
222 n_rx_bytes[0] += wc[0].byte_len;
223 n_rx_bytes[1] += wc[1].byte_len;
224 n_rx_bytes[2] += wc[2].byte_len;
225 n_rx_bytes[3] += wc[3].byte_len;
/* Remaining completions, one at a time; they accumulate into lane 0. */
233 while (n_left_from >= 1)
235 to_next[0] = wc[0].wr_id;
236 bufsz[0] = wc[0].byte_len;
237 n_rx_bytes[0] += wc[0].byte_len;
/* Total received bytes = sum of the four partial sums. */
245 return n_rx_bytes[0] + n_rx_bytes[1] + n_rx_bytes[2] + n_rx_bytes[3];
/* Initialise the metadata of received buffers: set current_length from
   bufsz[], set the RX sw_if_index to the given value and the TX sw_if_index
   to ~0.
   n_left_from: number of buffers to process.
   bufs:        buffer pointers.
   bufsz:       per-buffer length, parallel to bufs.
   sw_if_index: value written as each buffer's RX interface.
   NOTE(review): the loop advances are not visible in this excerpt. */
248 static_always_inline void
249 rdma_device_input_bufs_init (u32 n_left_from, vlib_buffer_t ** bufs,
250 u32 * bufsz, u32 sw_if_index)
252 while (n_left_from >= 4)
/* With 8 or more left, prefetch the next four buffer headers for writing and
   the next four lengths for reading. */
254 if (PREDICT_TRUE (n_left_from >= 8))
256 vlib_prefetch_buffer_header (bufs[4 + 0], STORE);
257 vlib_prefetch_buffer_header (bufs[4 + 1], STORE);
258 vlib_prefetch_buffer_header (bufs[4 + 2], STORE);
259 vlib_prefetch_buffer_header (bufs[4 + 3], STORE);
260 CLIB_PREFETCH (&bufsz[4 + 0], 4 * sizeof (bufsz[0]), LOAD);
263 bufs[0]->current_length = bufsz[0];
264 bufs[1]->current_length = bufsz[1];
265 bufs[2]->current_length = bufsz[2];
266 bufs[3]->current_length = bufsz[3];
268 vnet_buffer (bufs[0])->sw_if_index[VLIB_RX] = sw_if_index;
269 vnet_buffer (bufs[1])->sw_if_index[VLIB_RX] = sw_if_index;
270 vnet_buffer (bufs[2])->sw_if_index[VLIB_RX] = sw_if_index;
271 vnet_buffer (bufs[3])->sw_if_index[VLIB_RX] = sw_if_index;
273 vnet_buffer (bufs[0])->sw_if_index[VLIB_TX] = ~0;
274 vnet_buffer (bufs[1])->sw_if_index[VLIB_TX] = ~0;
275 vnet_buffer (bufs[2])->sw_if_index[VLIB_TX] = ~0;
276 vnet_buffer (bufs[3])->sw_if_index[VLIB_TX] = ~0;
/* Remaining buffers, one at a time. */
283 while (n_left_from >= 1)
285 bufs[0]->current_length = bufsz[0];
286 vnet_buffer (bufs[0])->sw_if_index[VLIB_RX] = sw_if_index;
287 vnet_buffer (bufs[0])->sw_if_index[VLIB_TX] = ~0;
/* Receive path for one device queue: poll the completion queue for up to
   VLIB_FRAME_SIZE completions, place the corresponding buffers in a new frame
   for the device's next node, initialise and trace them, update the RX
   counter, then refill the queue.
   NOTE(review): the return statements and the last argument of
   vlib_get_new_next_frame are not visible in this excerpt. */
295 static_always_inline uword
296 rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
297 vlib_frame_t * frame, rdma_device_t * rd, u16 qid)
299 vnet_main_t *vnm = vnet_get_main ();
300 rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
301 struct ibv_wc wc[VLIB_FRAME_SIZE];
302 u32 bufsz[VLIB_FRAME_SIZE];
303 vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
304 u32 *to_next, n_left_to_next;
305 u32 n_rx_packets, n_rx_bytes;
307 n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);
/* NOTE(review): n_rx_packets is u32, so `<= 0` is only true for 0.
   ibv_poll_cq() is documented as returning int, negative on failure; such a
   value would be seen here as a very large packet count.  Confirm and
   consider holding the poll result in a signed variable. */
309 if (PREDICT_FALSE (n_rx_packets <= 0))
/* Nothing received: still top up the queue. */
311 rdma_device_input_refill (vm, rd, rxq);
315 vlib_get_new_next_frame (vm, node, rd->per_interface_next_index, to_next,
/* wr_id values go straight into the frame's vector; byte lengths go to
   bufsz[].  The vector entries are then resolved to buffer pointers. */
317 n_rx_bytes = rdma_device_input_load_wc (n_rx_packets, wc, to_next, bufsz);
318 vlib_get_buffers (vm, to_next, bufs, n_rx_packets);
319 rdma_device_input_bufs_init (n_rx_packets, bufs, bufsz, rd->sw_if_index);
320 rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next);
321 rdma_device_input_ethernet (vm, node, rd);
/* Hand the frame over, reporting the space left after n_rx_packets entries. */
323 vlib_put_next_frame (vm, node, rd->per_interface_next_index,
324 n_left_to_next - n_rx_packets);
/* Account packets and bytes on the interface RX combined counter. */
326 vlib_increment_combined_counter
327 (vnm->interface_main.combined_sw_if_counters +
328 VNET_INTERFACE_COUNTER_RX, vm->thread_index,
329 rd->hw_if_index, n_rx_packets, n_rx_bytes);
/* Completed entries no longer count as enqueued; then repost buffers. */
331 rxq->n_enq -= n_rx_packets;
333 rdma_device_input_refill (vm, rd, rxq);
/* Node function of rdma-input: walk every (device, queue) pair listed in this
   node's runtime data and run the receive path for devices that have
   RDMA_DEVICE_F_ADMIN_UP set, accumulating the results in n_rx.
   NOTE(review): the declarations of n_rx / rd and the return statement are
   not visible in this excerpt. */
338 VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm,
339 vlib_node_runtime_t * node,
340 vlib_frame_t * frame)
343 rdma_main_t *rm = &rdma_main;
/* The node's runtime_data is interpreted as the device-input runtime. */
344 vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
345 vnet_device_and_queue_t *dq;
347 foreach_device_and_queue (dq, rt->devices_and_queues)
350 rd = vec_elt_at_index (rm->devices, dq->dev_instance);
/* Only devices flagged admin-up are polled. */
351 if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ADMIN_UP))
352 n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id);
/* Registration of the "rdma-input" node: an input-type node declared as a
   sibling of "device-input", registered in the disabled state, with its own
   trace formatter and error counter table. */
358 VLIB_REGISTER_NODE (rdma_input_node) = {
359 .name = "rdma-input",
360 .sibling_of = "device-input",
361 .format_trace = format_rdma_input_trace,
362 .type = VLIB_NODE_TYPE_INPUT,
363 .state = VLIB_NODE_STATE_DISABLED,
/* Error counters: count and strings for the node's error list. */
364 .n_errors = RDMA_INPUT_N_ERROR,
365 .error_strings = rdma_input_error_strings,
372 * fd.io coding-style-patch-verification: ON
375 * eval: (c-set-style "gnu")