/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#include <linux/if_link.h>
#include <linux/if_ether.h>

#include <vppinfra/linux/sysfs.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vnet/ethernet/ethernet.h>

#include <rdma/rdma.h>
32 rdma_main_t rdma_main;
34 #define rdma_log_debug(dev, f, ...) \
36 vlib_log(VLIB_LOG_LEVEL_DEBUG, rdma_main.log_class, "%U: " f, \
37 format_vlib_pci_addr, &rd->pci_addr, ##__VA_ARGS__); \
41 rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
43 rdma_main_t *rm = &rdma_main;
44 vlib_log_warn (rm->log_class, "TODO");
49 rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
51 struct ibv_port_attr attr;
55 if (ibv_query_port (rd->ctx, port, &attr))
57 vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
58 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
65 case IBV_PORT_ACTIVE: /* fallthrough */
66 case IBV_PORT_ACTIVE_DEFER:
67 rd->flags |= RDMA_DEVICE_F_LINK_UP;
68 vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
69 VNET_HW_INTERFACE_FLAG_LINK_UP);
72 rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
73 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
78 switch (attr.active_width)
93 switch (attr.active_speed)
101 case 4: /* fallthrough */
112 vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed);
115 static clib_error_t *
116 rdma_async_event_error_ready (clib_file_t * f)
118 rdma_main_t *rm = &rdma_main;
119 rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
120 return clib_error_return (0, "RDMA async event error for device %U",
121 format_vlib_pci_addr, &rd->pci_addr);
124 static clib_error_t *
125 rdma_async_event_read_ready (clib_file_t * f)
127 vnet_main_t *vnm = vnet_get_main ();
128 rdma_main_t *rm = &rdma_main;
129 rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
131 struct ibv_async_event event;
132 ret = ibv_get_async_event (rd->ctx, &event);
135 return clib_error_return_unix (0, "ibv_get_async_event() failed");
138 switch (event.event_type)
140 case IBV_EVENT_PORT_ACTIVE:
141 rdma_update_state (vnm, rd, event.element.port_num);
143 case IBV_EVENT_PORT_ERR:
144 rdma_update_state (vnm, rd, event.element.port_num);
146 case IBV_EVENT_DEVICE_FATAL:
147 rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
148 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
149 vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U",
150 format_vlib_pci_addr, &rd->pci_addr);
153 vlib_log_warn (rm->log_class,
154 "Unhandeld RDMA async event %i for device %U",
155 event.event_type, format_vlib_pci_addr, &rd->pci_addr);
159 ibv_ack_async_event (&event);
163 static clib_error_t *
164 rdma_async_event_init (rdma_device_t * rd)
166 clib_file_t t = { 0 };
169 /* make RDMA async event fd non-blocking */
170 ret = fcntl (rd->ctx->async_fd, F_GETFL);
173 return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
175 ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
178 return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");
181 /* register RDMA async event fd */
182 t.read_function = rdma_async_event_read_ready;
183 t.file_descriptor = rd->ctx->async_fd;
184 t.error_function = rdma_async_event_error_ready;
185 t.private_data = rd->dev_instance;
187 format (0, "RMDA %U async event", format_vlib_pci_addr, &rd->pci_addr);
189 rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
195 rdma_async_event_cleanup (rdma_device_t * rd)
197 clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
200 static clib_error_t *
201 rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
203 return ethernet_register_interface (vnm, rdma_device_class.index,
204 rd->dev_instance, rd->hwaddr,
205 &rd->hw_if_index, rdma_flag_change);
209 rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
211 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
212 vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
213 ethernet_delete_interface (vnm, rd->hw_if_index);
217 rdma_dev_cleanup (rdma_device_t * rd)
219 rdma_main_t *rm = &rdma_main;
223 #define _(fn, arg) if (arg) \
226 if ((rv = fn (arg))) \
227 rdma_log_debug (rd, #fn "() failed (rv = %d)", rv); \
230 _(ibv_destroy_flow, rd->flow_mcast);
231 _(ibv_destroy_flow, rd->flow_ucast);
232 _(ibv_dereg_mr, rd->mr);
233 vec_foreach (txq, rd->txqs)
235 _(ibv_destroy_qp, txq->qp);
236 _(ibv_destroy_cq, txq->cq);
238 vec_foreach (rxq, rd->rxqs)
240 _(ibv_destroy_qp, rxq->qp);
241 _(ibv_destroy_cq, rxq->cq);
243 _(ibv_dealloc_pd, rd->pd);
244 _(ibv_close_device, rd->ctx);
247 clib_error_free (rd->error);
251 pool_put (rm->devices, rd);
254 static clib_error_t *
255 rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
258 struct ibv_qp_init_attr qpia;
259 struct ibv_qp_attr qpa;
262 vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
263 rxq = vec_elt_at_index (rd->rxqs, qid);
266 if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
267 return clib_error_return_unix (0, "Create CQ Failed");
269 memset (&qpia, 0, sizeof (qpia));
270 qpia.qp_type = IBV_QPT_RAW_PACKET;
271 qpia.send_cq = rxq->cq;
272 qpia.recv_cq = rxq->cq;
273 qpia.cap.max_recv_wr = n_desc;
274 qpia.cap.max_recv_sge = 1;
276 if ((rxq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
277 return clib_error_return_unix (0, "Queue Pair create failed");
279 memset (&qpa, 0, sizeof (qpa));
280 qp_flags = IBV_QP_STATE | IBV_QP_PORT;
281 qpa.qp_state = IBV_QPS_INIT;
283 if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
284 return clib_error_return_unix (0, "Modify QP (init) Failed");
286 memset (&qpa, 0, sizeof (qpa));
287 qp_flags = IBV_QP_STATE;
288 qpa.qp_state = IBV_QPS_RTR;
289 if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
290 return clib_error_return_unix (0, "Modify QP (receive) Failed");
295 static clib_error_t *
296 rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
299 struct ibv_qp_init_attr qpia;
300 struct ibv_qp_attr qpa;
303 vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
304 txq = vec_elt_at_index (rd->txqs, qid);
307 if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
308 return clib_error_return_unix (0, "Create CQ Failed");
310 memset (&qpia, 0, sizeof (qpia));
311 qpia.qp_type = IBV_QPT_RAW_PACKET;
312 qpia.send_cq = txq->cq;
313 qpia.recv_cq = txq->cq;
314 qpia.cap.max_send_wr = n_desc;
315 qpia.cap.max_send_sge = 1;
317 if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
318 return clib_error_return_unix (0, "Queue Pair create failed");
320 memset (&qpa, 0, sizeof (qpa));
321 qp_flags = IBV_QP_STATE | IBV_QP_PORT;
322 qpa.qp_state = IBV_QPS_INIT;
324 if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
325 return clib_error_return_unix (0, "Modify QP (init) Failed");
327 memset (&qpa, 0, sizeof (qpa));
328 qp_flags = IBV_QP_STATE;
329 qpa.qp_state = IBV_QPS_RTR;
330 if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
331 return clib_error_return_unix (0, "Modify QP (receive) Failed");
333 memset (&qpa, 0, sizeof (qpa));
334 qp_flags = IBV_QP_STATE;
335 qpa.qp_state = IBV_QPS_RTS;
336 if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
337 return clib_error_return_unix (0, "Modify QP (send) Failed");
341 static clib_error_t *
342 rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd)
345 vlib_buffer_main_t *bm = vm->buffer_main;
346 vlib_thread_main_t *tm = vlib_get_thread_main ();
350 return clib_error_return_unix (0, "Device Open Failed");
352 if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
353 return clib_error_return_unix (0, "PD Alloc Failed");
355 if ((err = rdma_rxq_init (vm, rd, 0, 512)))
358 for (i = 0; i < tm->n_vlib_mains; i++)
359 if ((err = rdma_txq_init (vm, rd, i, 512)))
362 if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
364 IBV_ACCESS_LOCAL_WRITE)) == 0)
365 return clib_error_return_unix (0, "Register MR Failed");
367 ethernet_mac_address_generate (rd->hwaddr);
370 * restrict packets steering to our MAC
371 * allows to share a single HW NIC with multiple RDMA ifaces
374 struct raw_eth_flow_attr
376 struct ibv_flow_attr attr;
377 struct ibv_flow_spec_eth spec_eth;
378 } __attribute__ ((packed)) fa;
379 memset (&fa, 0, sizeof (fa));
380 fa.attr.num_of_specs = 1;
382 fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
383 fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
384 memcpy (fa.spec_eth.val.dst_mac, rd->hwaddr,
385 sizeof (fa.spec_eth.val.dst_mac));
386 memset (fa.spec_eth.mask.dst_mac, 0xff, sizeof (fa.spec_eth.mask.dst_mac));
387 if ((rd->flow_ucast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
388 return clib_error_return_unix (0, "create Flow Failed");
390 /* receive multicast packets too */
391 memset (&fa, 0, sizeof (fa));
392 fa.attr.num_of_specs = 1;
394 fa.attr.flags = IBV_FLOW_ATTR_FLAGS_DONT_TRAP; /* let others receive them too */
395 fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
396 fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
397 fa.spec_eth.val.dst_mac[0] = 1;
398 fa.spec_eth.mask.dst_mac[0] = 1;
399 if ((rd->flow_mcast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
400 return clib_error_return_unix (0, "create Flow Failed");
406 sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
412 s = clib_sysfs_link_to_name (path);
413 unformat_init_string (&in, (char *) s, strlen ((char *) s));
414 rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
421 rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
423 vnet_main_t *vnm = vnet_get_main ();
424 rdma_main_t *rm = &rdma_main;
425 rdma_device_t *rd = 0;
426 struct ibv_device **dev_list = 0;
430 pool_get_zero (rm->devices, rd);
431 rd->dev_instance = rd - rm->devices;
432 rd->per_interface_next_index = ~0;
434 /* check if device exist and if it is bound to mlx5_core */
435 s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0);
436 s2 = clib_sysfs_link_to_name ((char *) s);
438 if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0)
441 clib_error_return (0,
442 "invalid interface (only mlx5 supported for now)");
446 /* extract PCI address */
447 vec_reset_length (s);
448 s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0);
449 if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0)
451 args->error = clib_error_return (0, "cannot find PCI address");
455 dev_list = ibv_get_device_list (&n_devs);
459 clib_error_return_unix (0,
460 "no RDMA devices available, errno = %d. Is the ib_uverbs module loaded?",
465 for (int i = 0; i < n_devs; i++)
467 vlib_pci_addr_t addr;
469 vec_reset_length (s);
470 s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);
472 if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
475 if (addr.as_u32 != rd->pci_addr.as_u32)
478 if ((rd->ctx = ibv_open_device (dev_list[i])))
482 if ((args->error = rdma_dev_init (vm, rd)))
485 if ((args->error = rdma_register_interface (vnm, rd)))
488 if ((args->error = rdma_async_event_init (rd)))
491 rdma_update_state (vnm, rd, 1);
493 vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index);
494 args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
496 * FIXME: add support for interrupt mode
497 * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
498 * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
500 vnet_hw_interface_set_input_node (vnm, rd->hw_if_index,
501 rdma_input_node.index);
502 vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, 0, ~0);
506 rdma_unregister_interface (vnm, rd);
508 rdma_dev_cleanup (rd);
510 ibv_free_device_list (dev_list);
514 args->rv = VNET_API_ERROR_INVALID_INTERFACE;
515 vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
519 rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd)
521 rdma_async_event_cleanup (rd);
522 rdma_unregister_interface (vnet_get_main (), rd);
523 rdma_dev_cleanup (rd);
526 static clib_error_t *
527 rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
529 vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
530 rdma_main_t *rm = &rdma_main;
531 rdma_device_t *rd = vec_elt_at_index (rm->devices, hi->dev_instance);
532 uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
534 if (rd->flags & RDMA_DEVICE_F_ERROR)
535 return clib_error_return (0, "device is in error state");
539 vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
540 VNET_HW_INTERFACE_FLAG_LINK_UP);
541 rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
545 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
546 rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
552 rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
555 rdma_main_t *rm = &rdma_main;
556 vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
557 rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance);
559 /* Shut off redirection */
560 if (node_index == ~0)
562 rd->per_interface_next_index = node_index;
566 rd->per_interface_next_index =
567 vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index);
570 static char *rdma_tx_func_error_strings[] = {
572 foreach_rdma_tx_func_error
577 VNET_DEVICE_CLASS (rdma_device_class,) =
579 .name = "RDMA interface",
580 .format_device = format_rdma_device,
581 .format_device_name = format_rdma_device_name,
582 .admin_up_down_function = rdma_interface_admin_up_down,
583 .rx_redirect_to_node = rdma_set_interface_next_node,
584 .tx_function_n_errors = RDMA_TX_N_ERROR,
585 .tx_function_error_strings = rdma_tx_func_error_strings,
590 rdma_init (vlib_main_t * vm)
592 rdma_main_t *rm = &rdma_main;
594 rm->log_class = vlib_log_register_class ("rdma", 0);
599 VLIB_INIT_FUNCTION (rdma_init);
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */