/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>

#include <linux/if_link.h>
#include <linux/if_ether.h>

#include <vppinfra/linux/sysfs.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vnet/ethernet/ethernet.h>

#include <rdma/rdma.h>
32 rdma_main_t rdma_main;
34 #define rdma_log_debug(dev, f, ...) \
36 vlib_log(VLIB_LOG_LEVEL_DEBUG, rdma_main.log_class, "%U: " f, \
37 format_vlib_pci_addr, &rd->pci_addr, ##__VA_ARGS__); \
41 rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
43 rdma_main_t *rm = &rdma_main;
44 vlib_log_warn (rm->log_class, "TODO");
49 rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
51 struct ibv_port_attr attr;
55 if (ibv_query_port (rd->ctx, port, &attr))
57 vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
58 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
65 case IBV_PORT_ACTIVE: /* fallthrough */
66 case IBV_PORT_ACTIVE_DEFER:
67 rd->flags |= RDMA_DEVICE_F_LINK_UP;
68 vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
69 VNET_HW_INTERFACE_FLAG_LINK_UP);
72 rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
73 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
78 switch (attr.active_width)
93 switch (attr.active_speed)
101 case 4: /* fallthrough */
112 vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed);
115 static clib_error_t *
116 rdma_async_event_error_ready (clib_file_t * f)
118 rdma_main_t *rm = &rdma_main;
119 rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
120 return clib_error_return (0, "RDMA async event error for device %U",
121 format_vlib_pci_addr, &rd->pci_addr);
124 static clib_error_t *
125 rdma_async_event_read_ready (clib_file_t * f)
127 vnet_main_t *vnm = vnet_get_main ();
128 rdma_main_t *rm = &rdma_main;
129 rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
131 struct ibv_async_event event;
132 ret = ibv_get_async_event (rd->ctx, &event);
135 return clib_error_return_unix (0, "ibv_get_async_event() failed");
138 switch (event.event_type)
140 case IBV_EVENT_PORT_ACTIVE:
141 rdma_update_state (vnm, rd, event.element.port_num);
143 case IBV_EVENT_PORT_ERR:
144 rdma_update_state (vnm, rd, event.element.port_num);
146 case IBV_EVENT_DEVICE_FATAL:
147 rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
148 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
149 vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U",
150 format_vlib_pci_addr, &rd->pci_addr);
153 vlib_log_warn (rm->log_class,
154 "Unhandeld RDMA async event %i for device %U",
155 event.event_type, format_vlib_pci_addr, &rd->pci_addr);
159 ibv_ack_async_event (&event);
163 static clib_error_t *
164 rdma_async_event_init (rdma_device_t * rd)
166 clib_file_t t = { 0 };
169 /* make RDMA async event fd non-blocking */
170 ret = fcntl (rd->ctx->async_fd, F_GETFL);
173 return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
175 ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
178 return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");
181 /* register RDMA async event fd */
182 t.read_function = rdma_async_event_read_ready;
183 t.file_descriptor = rd->ctx->async_fd;
184 t.error_function = rdma_async_event_error_ready;
185 t.private_data = rd->dev_instance;
187 format (0, "RMDA %U async event", format_vlib_pci_addr, &rd->pci_addr);
189 rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
195 rdma_async_event_cleanup (rdma_device_t * rd)
197 clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
200 static clib_error_t *
201 rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
203 return ethernet_register_interface (vnm, rdma_device_class.index,
204 rd->dev_instance, rd->hwaddr,
205 &rd->hw_if_index, rdma_flag_change);
209 rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
211 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
212 vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
213 ethernet_delete_interface (vnm, rd->hw_if_index);
217 rdma_dev_cleanup (rdma_device_t * rd)
219 rdma_main_t *rm = &rdma_main;
223 #define _(fn, arg) if (arg) \
226 if ((rv = fn (arg))) \
227 rdma_log_debug (rd, #fn "() failed (rv = %d)", rv); \
230 _(ibv_destroy_flow, rd->flow_mcast);
231 _(ibv_destroy_flow, rd->flow_ucast);
232 _(ibv_dereg_mr, rd->mr);
233 vec_foreach (txq, rd->txqs)
235 _(ibv_destroy_qp, txq->qp);
236 _(ibv_destroy_cq, txq->cq);
238 vec_foreach (rxq, rd->rxqs)
240 _(ibv_destroy_qp, rxq->qp);
241 _(ibv_destroy_cq, rxq->cq);
243 _(ibv_dealloc_pd, rd->pd);
244 _(ibv_close_device, rd->ctx);
247 clib_error_free (rd->error);
252 pool_put (rm->devices, rd);
255 static clib_error_t *
256 rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
259 struct ibv_qp_init_attr qpia;
260 struct ibv_qp_attr qpa;
263 vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
264 rxq = vec_elt_at_index (rd->rxqs, qid);
267 if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
268 return clib_error_return_unix (0, "Create CQ Failed");
270 memset (&qpia, 0, sizeof (qpia));
271 qpia.send_cq = rxq->cq;
272 qpia.recv_cq = rxq->cq;
273 qpia.cap.max_recv_wr = n_desc;
274 qpia.cap.max_recv_sge = 1;
275 qpia.qp_type = IBV_QPT_RAW_PACKET;
277 if ((rxq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
278 return clib_error_return_unix (0, "Queue Pair create failed");
280 memset (&qpa, 0, sizeof (qpa));
281 qp_flags = IBV_QP_STATE | IBV_QP_PORT;
282 qpa.qp_state = IBV_QPS_INIT;
284 if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
285 return clib_error_return_unix (0, "Modify QP (init) Failed");
287 memset (&qpa, 0, sizeof (qpa));
288 qp_flags = IBV_QP_STATE;
289 qpa.qp_state = IBV_QPS_RTR;
290 if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
291 return clib_error_return_unix (0, "Modify QP (receive) Failed");
296 static clib_error_t *
297 rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
300 struct ibv_qp_init_attr qpia;
301 struct ibv_qp_attr qpa;
304 vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
305 txq = vec_elt_at_index (rd->txqs, qid);
308 if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
309 return clib_error_return_unix (0, "Create CQ Failed");
311 memset (&qpia, 0, sizeof (qpia));
312 qpia.send_cq = txq->cq;
313 qpia.recv_cq = txq->cq;
314 qpia.cap.max_send_wr = n_desc;
315 qpia.cap.max_send_sge = 1;
316 qpia.qp_type = IBV_QPT_RAW_PACKET;
319 if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
320 return clib_error_return_unix (0, "Queue Pair create failed");
322 memset (&qpa, 0, sizeof (qpa));
323 qp_flags = IBV_QP_STATE | IBV_QP_PORT;
324 qpa.qp_state = IBV_QPS_INIT;
326 if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
327 return clib_error_return_unix (0, "Modify QP (init) Failed");
329 memset (&qpa, 0, sizeof (qpa));
330 qp_flags = IBV_QP_STATE;
331 qpa.qp_state = IBV_QPS_RTR;
332 if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
333 return clib_error_return_unix (0, "Modify QP (receive) Failed");
335 memset (&qpa, 0, sizeof (qpa));
336 qp_flags = IBV_QP_STATE;
337 qpa.qp_state = IBV_QPS_RTS;
338 if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
339 return clib_error_return_unix (0, "Modify QP (send) Failed");
343 static clib_error_t *
344 rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd)
347 vlib_buffer_main_t *bm = vm->buffer_main;
348 vlib_thread_main_t *tm = vlib_get_thread_main ();
352 return clib_error_return_unix (0, "Device Open Failed");
354 if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
355 return clib_error_return_unix (0, "PD Alloc Failed");
357 if ((err = rdma_rxq_init (vm, rd, 0, 512)))
360 for (i = 0; i < tm->n_vlib_mains; i++)
361 if ((err = rdma_txq_init (vm, rd, i, 512)))
364 if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
366 IBV_ACCESS_LOCAL_WRITE)) == 0)
367 return clib_error_return_unix (0, "Register MR Failed");
369 ethernet_mac_address_generate (rd->hwaddr);
372 * restrict packets steering to our MAC
373 * allows to share a single HW NIC with multiple RDMA ifaces
376 struct raw_eth_flow_attr
378 struct ibv_flow_attr attr;
379 struct ibv_flow_spec_eth spec_eth;
380 } __attribute__ ((packed)) fa;
381 memset (&fa, 0, sizeof (fa));
382 fa.attr.num_of_specs = 1;
384 fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
385 fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
386 memcpy (fa.spec_eth.val.dst_mac, rd->hwaddr,
387 sizeof (fa.spec_eth.val.dst_mac));
388 memset (fa.spec_eth.mask.dst_mac, 0xff, sizeof (fa.spec_eth.mask.dst_mac));
389 if ((rd->flow_ucast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
390 return clib_error_return_unix (0, "create Flow Failed");
392 /* receive multicast packets too */
393 memset (&fa, 0, sizeof (fa));
394 fa.attr.num_of_specs = 1;
396 fa.attr.flags = IBV_FLOW_ATTR_FLAGS_DONT_TRAP; /* let others receive them too */
397 fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
398 fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
399 fa.spec_eth.val.dst_mac[0] = 1;
400 fa.spec_eth.mask.dst_mac[0] = 1;
401 if ((rd->flow_mcast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
402 return clib_error_return_unix (0, "create Flow Failed");
408 sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
414 s = clib_sysfs_link_to_name (path);
415 unformat_init_string (&in, (char *) s, strlen ((char *) s));
416 rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
423 rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
425 vnet_main_t *vnm = vnet_get_main ();
426 rdma_main_t *rm = &rdma_main;
427 rdma_device_t *rd = 0;
428 struct ibv_device **dev_list = 0;
432 pool_get_zero (rm->devices, rd);
433 rd->dev_instance = rd - rm->devices;
434 rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
435 rd->name = vec_dup (args->name);
437 /* check if device exist and if it is bound to mlx5_core */
438 s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0);
439 s2 = clib_sysfs_link_to_name ((char *) s);
441 if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0)
444 clib_error_return (0,
445 "invalid interface (only mlx5 supported for now)");
449 /* extract PCI address */
450 vec_reset_length (s);
451 s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0);
452 if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0)
454 args->error = clib_error_return (0, "cannot find PCI address");
458 dev_list = ibv_get_device_list (&n_devs);
462 clib_error_return_unix (0,
463 "no RDMA devices available, errno = %d. Is the ib_uverbs module loaded?",
468 for (int i = 0; i < n_devs; i++)
470 vlib_pci_addr_t addr;
472 vec_reset_length (s);
473 s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);
475 if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
478 if (addr.as_u32 != rd->pci_addr.as_u32)
481 if ((rd->ctx = ibv_open_device (dev_list[i])))
485 if ((args->error = rdma_dev_init (vm, rd)))
488 if ((args->error = rdma_register_interface (vnm, rd)))
491 if ((args->error = rdma_async_event_init (rd)))
494 rdma_update_state (vnm, rd, 1);
496 vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index);
497 args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
499 * FIXME: add support for interrupt mode
500 * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
501 * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
503 vnet_hw_interface_set_input_node (vnm, rd->hw_if_index,
504 rdma_input_node.index);
505 vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, 0, ~0);
509 rdma_unregister_interface (vnm, rd);
511 rdma_dev_cleanup (rd);
513 ibv_free_device_list (dev_list);
517 args->rv = VNET_API_ERROR_INVALID_INTERFACE;
518 vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
522 rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd)
524 rdma_async_event_cleanup (rd);
525 rdma_unregister_interface (vnet_get_main (), rd);
526 rdma_dev_cleanup (rd);
529 static clib_error_t *
530 rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
532 vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
533 rdma_main_t *rm = &rdma_main;
534 rdma_device_t *rd = vec_elt_at_index (rm->devices, hi->dev_instance);
535 uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
537 if (rd->flags & RDMA_DEVICE_F_ERROR)
538 return clib_error_return (0, "device is in error state");
542 vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
543 VNET_HW_INTERFACE_FLAG_LINK_UP);
544 rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
548 vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
549 rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
555 rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
558 rdma_main_t *rm = &rdma_main;
559 vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
560 rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance);
562 /* Shut off redirection */
563 if (node_index == ~0)
565 rd->per_interface_next_index = node_index;
569 rd->per_interface_next_index =
570 vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index);
573 static char *rdma_tx_func_error_strings[] = {
575 foreach_rdma_tx_func_error
580 VNET_DEVICE_CLASS (rdma_device_class,) =
582 .name = "RDMA interface",
583 .format_device = format_rdma_device,
584 .format_device_name = format_rdma_device_name,
585 .admin_up_down_function = rdma_interface_admin_up_down,
586 .rx_redirect_to_node = rdma_set_interface_next_node,
587 .tx_function_n_errors = RDMA_TX_N_ERROR,
588 .tx_function_error_strings = rdma_tx_func_error_strings,
593 rdma_init (vlib_main_t * vm)
595 rdma_main_t *rm = &rdma_main;
597 rm->log_class = vlib_log_register_class ("rdma", 0);
602 VLIB_INIT_FUNCTION (rdma_init);
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */