Add RDMA ibverb driver plugin 43/17643/11
author Benoît Ganne <bganne@cisco.com>
Mon, 25 Mar 2019 10:41:34 +0000 (11:41 +0100)
committer Damjan Marion <dmarion@me.com>
Thu, 28 Mar 2019 19:31:59 +0000 (19:31 +0000)
RDMA ibverb is a userspace API to efficiently rx/tx packets. This is an
initial, unoptimized driver targeting Mellanox cards.
Next steps should include batching, multiqueue and additional cards.

Change-Id: I0309c7a543f75f2f9317eaf63ca502ac7a093ef9
Signed-off-by: Benoît Ganne <bganne@cisco.com>
15 files changed:
build/external/Makefile
build/external/deb/debian/rules
build/external/packages.mk
build/external/packages/dpdk.mk
build/external/packages/rdma-core.mk [new file with mode: 0644]
src/plugins/rdma/CMakeLists.txt [new file with mode: 0644]
src/plugins/rdma/cli.c [new file with mode: 0644]
src/plugins/rdma/device.c [new file with mode: 0644]
src/plugins/rdma/format.c [new file with mode: 0644]
src/plugins/rdma/input.c [new file with mode: 0644]
src/plugins/rdma/output.c [new file with mode: 0644]
src/plugins/rdma/plugin.c [new file with mode: 0644]
src/plugins/rdma/rdma.h [new file with mode: 0644]
src/vnet/devices/tap/tap.c
src/vnet/ethernet/mac_address.h

index a1352a6..084d694 100644 (file)
@@ -31,11 +31,18 @@ include packages.mk
 include packages/nasm.mk
 include packages/ipsec-mb.mk
 include packages/dpdk.mk
+include packages/rdma-core.mk
 
 .PHONY: clean
 clean:
        @rm -rf $(B) $(I)
 
+.PHONY: install
+install: dpdk-install rdma-core-install
+
+.PHONY: config
+config: dpdk-config rdma-core-config
+
 ##############################################################################
 # .deb packaging
 ##############################################################################
@@ -62,11 +69,6 @@ build-deb: $(DEV_DEB)
 
 install-deb:
 ifneq ($(INSTALLED_VER),$(DEB_VER)-$(PKG_SUFFIX))
-       @echo "=========================================================="
-       @echo " Out of date vpp-ext-deps package installed."
-       @echo " Installed: $(INSTALLED_VER)"
-       @echo " Needed: $(DEB_VER)-$(PKG_SUFFIX)"
-       @echo "=========================================================="
        @make $(DEV_DEB)
        @sudo dpkg -i $(DEV_DEB)
 else
@@ -78,9 +80,9 @@ endif
 check-deb:
 ifneq ($(INSTALLED_VER),$(DEB_VER)-$(PKG_SUFFIX))
        @echo "=========================================================="
-       @echo " Outdated DPDK package detected:"
-       @echo "  Installed: vpp-ext-deps $(INSTALLED_VER)"
-       @echo "  Current:   vpp-ext-deps $(DEB_VER)-$(PKG_SUFFIX)"
+       @echo " Out of date vpp-ext-deps package installed."
+       @echo " Installed: $(INSTALLED_VER)"
+       @echo " Needed: $(DEB_VER)-$(PKG_SUFFIX)"
        @echo ""
        @echo " Please upgrade by invoking 'make install-ext-deps'"
        @echo " from the top level directory."
@@ -115,16 +117,16 @@ ifneq ($(INSTALLED_RPM_VER),$(RPM_VER)-$(PKG_SUFFIX))
        sudo rpm -Uih --force $(DEV_RPM)
 else
        @echo "=========================================================="
-       @echo " Up-to-date DPDK package already installed"
+       @echo " Up-to-date vpp-ext-deps package already installed"
        @echo "=========================================================="
 endif
 
 check-rpm:
 ifneq ($(INSTALLED_RPM_VER),$(RPM_VER)-$(PKG_SUFFIX))
        @echo "=========================================================="
-       @echo " Outdated DPDK package detected:"
-       @echo "  Installed: vpp-ext-deps $(INSTALLED_RPM_VER)"
-       @echo "  Current:   vpp-ext-deps $(RPM_VER)-$(PKG_SUFFIX)"
+       @echo " Out of date vpp-ext-deps package installed."
+       @echo " Installed: $(INSTALLED_RPM_VER)"
+       @echo " Needed: $(RPM_VER)-$(PKG_SUFFIX)"
        @echo ""
        @echo " Please upgrade by invoking 'make install-ext-deps'"
        @echo " from the top level directory."
@@ -140,9 +142,9 @@ endif
 ebuild-build:
 ifeq ($(INSTALLED_VER)$(INSTALLED_RPM_VER),)
        @echo "=========================================================="
-       @echo "Building DPDK from source. Consider installing development"
-       @echo "package by invoking 'make install-ext-deps' from the"
-       @echo "top level directory"
+       @echo "Building vpp-ext-deps from source. Consider installing"
+       @echo "development package by invoking 'make install-ext-deps'"
+       @echo "from the top level directory"
        @echo "=========================================================="
        make config
 else
index 6393f82..2b1157e 100755 (executable)
@@ -20,7 +20,6 @@ override_dh_clean:
        make $(MAKE_ARGS) clean
 
 override_dh_auto_configure:
-       make $(MAKE_ARGS) config
 
 override_dh_install:
        make $(MAKE_ARGS) install
index 4056b2f..005c2a9 100644 (file)
@@ -31,12 +31,12 @@ $1_install_log ?= $(B)/$1.install.log
 downloads/$($1_tarball):
        mkdir -p downloads
        @if [ -e $(DL_CACHE_DIR)/$($1_tarball) ] ; \
-               then cp $(DL_CACHE_DIR)/$($1_tarball) downloads/ ; \
+               then cp $(DL_CACHE_DIR)/$($1_tarball) $$@ ; \
        else \
                echo "Downloading $($1_url)" ; \
-               curl -o downloads/$($1_tarball) -LO $($1_url) ; \
+               curl -o $$@ -LO $($1_url) ; \
        fi
-       @rm -f $(B)/.download.ok
+       @rm -f $(B)/.$1.download.ok
 
 $(B)/.$1.download.ok: downloads/$($1_tarball)
        @mkdir -p $(B)
index 68c2767..ae9d9c5 100644 (file)
@@ -167,9 +167,7 @@ define set
 fi
 endef
 
-all: build
-
-$(B)/custom-config: $(B)/.patch.ok Makefile
+$(B)/custom-config: $(B)/.dpdk-patch.ok Makefile
        @echo --- generating custom config from $(DPDK_SOURCE)/config/defconfig_$(DPDK_TARGET) ---
        @cpp -undef -ffreestanding -x assembler-with-cpp $(DPDK_SOURCE)/config/defconfig_$(DPDK_TARGET) $@
        $(call set,RTE_MACHINE,$(DPDK_MACHINE))
@@ -230,18 +228,19 @@ $(B)/custom-config: $(B)/.patch.ok Makefile
        $(call set,RTE_LIBRTE_DPAA_PMD,n)
        $(call set,RTE_LIBRTE_PMD_DPAA_SEC,n)
        $(call set,RTE_LIBRTE_PMD_DPAA_EVENTDEV,n)
-       @rm -f .config.ok
+       @rm -f .dpdk-config.ok
+
+DPDK_DOWNLOADS = $(CURDIR)/downloads/$(DPDK_TARBALL)
 
-$(CURDIR)/$(DPDK_TARBALL):
+$(DPDK_DOWNLOADS):
+       mkdir -p downloads
        @if [ -e $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) ] ; \
-               then cp $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) $(CURDIR) ; \
-               else curl -o $(CURDIR)/$(DPDK_TARBALL) -LO $(DPDK_TAR_URL) ; \
+               then cp $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) $@ ; \
+               else curl -o $@ -LO $(DPDK_TAR_URL) ; \
        fi
-       @rm -f $(B)/.download.ok
+       @rm -f $(B)/.dpdk-download.ok
 
-DPDK_DOWNLOADS = $(CURDIR)/$(DPDK_TARBALL)
-
-$(B)/.download.ok: $(DPDK_DOWNLOADS)
+$(B)/.dpdk-download.ok: $(DPDK_DOWNLOADS)
        @mkdir -p $(B)
        @openssl md5 $< | cut -f 2 -d " " - > $(B)/$(DPDK_TARBALL).md5sum
        @([ "$$(<$(B)/$(DPDK_TARBALL).md5sum)" = "$(DPDK_$(DPDK_VERSION)_TARBALL_MD5_CKSUM)" ] || \
@@ -249,18 +248,18 @@ $(B)/.download.ok: $(DPDK_DOWNLOADS)
                rm $(B)/$(DPDK_TARBALL).md5sum && false ))
        @touch $@
 
-.PHONY: download
-download: $(B)/.download.ok
+.PHONY: dpdk-download
+dpdk-download: $(B)/.dpdk-download.ok
 
-$(B)/.extract.ok: $(B)/.download.ok
+$(B)/.dpdk-extract.ok: $(B)/.dpdk-download.ok
        @echo --- extracting $(DPDK_TARBALL) ---
-       @tar --directory $(B) --extract --file $(CURDIR)/$(DPDK_TARBALL)
+       @tar --directory $(B) --extract --file $(DPDK_DOWNLOADS)
        @touch $@
 
-.PHONY: extract
-extract: $(B)/.extract.ok
+.PHONY: dpdk-extract
+dpdk-extract: $(B)/.dpdk-extract.ok
 
-$(B)/.patch.ok: $(B)/.extract.ok
+$(B)/.dpdk-patch.ok: $(B)/.dpdk-extract.ok
 ifneq ($(wildcard $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch),)
        @echo --- patching ---
        @for f in $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch ; do \
@@ -270,26 +269,23 @@ ifneq ($(wildcard $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch),)
 endif
        @touch $@
 
-.PHONY: patch
-patch: $(B)/.patch.ok
+.PHONY: dpdk-patch
+dpdk-patch: $(B)/.dpdk-patch.ok
 
-$(B)/.config.ok: $(B)/.patch.ok $(B)/custom-config
+$(B)/.dpdk-config.ok: $(B)/.dpdk-patch.ok $(B)/custom-config
        @make $(DPDK_MAKE_ARGS) config
        @touch $@
 
-.PHONY: config
-config: $(B)/.config.ok
+.PHONY: dpdk-config
+dpdk-config: $(B)/.dpdk-config.ok
 
-.PHONY: build-dpdk
-build-dpdk: $(DPDK_BUILD_DEPS)
-       @if [ ! -e $(B)/.config.ok ] ; then echo 'Please run "make config" first' && false ; fi
+$(B)/.dpdk-build.ok: dpdk-config $(DPDK_BUILD_DEPS)
+       @if [ ! -e $(B)/.dpdk-config.ok ] ; then echo 'Please run "make config" first' && false ; fi
        @make $(DPDK_MAKE_ARGS) install
-
-$(B)/.build.ok: build-dpdk
        @touch $@
 
-.PHONY: build
-build: $(B)/.build.ok
+.PHONY: dpdk-build
+dpdk-build: $(B)/.dpdk-build.ok
 
-.PHONY: install
-install: $(B)/.build.ok
+.PHONY: dpdk-install
+dpdk-install: $(B)/.dpdk-build.ok
diff --git a/build/external/packages/rdma-core.mk b/build/external/packages/rdma-core.mk
new file mode 100644 (file)
index 0000000..0e8c878
--- /dev/null
@@ -0,0 +1,46 @@
+# Copyright (c) 2018 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# rdma-core release to fetch. The version selects the tarball name, the
+# expected md5 checksum (one rdma-core_tarball_md5sum_<version> entry per
+# known release) and the download URL.
+rdma-core_version             := 23
+rdma-core_tarball             := rdma-core-$(rdma-core_version).tar.gz
+rdma-core_tarball_md5sum_22.1 := dde4d30e3db20893408ae51041117034
+rdma-core_tarball_md5sum_23   := c78575735c4a71609c1a214ea16cd8dc
+rdma-core_tarball_md5sum      := $(rdma-core_tarball_md5sum_$(rdma-core_version))
+rdma-core_tarball_strip_dirs  := 1
+# Download over HTTPS: the tarball is only protected by an md5 checksum,
+# so do not additionally expose the transfer to plain-text tampering.
+rdma-core_url                 := https://github.com/linux-rdma/rdma-core/releases/download/v$(rdma-core_version)/$(rdma-core_tarball)
+
+RDMA_FILES := include/infiniband/verbs.h \
+             include/infiniband/verbs_api.h \
+             include/infiniband/ib_user_ioctl_verbs.h \
+             include/rdma/ib_user_verbs.h \
+             lib/statics/libibverbs.a \
+             lib/statics/libmlx5.a
+
+# Configure step: run cmake (Ninja generator) from the build directory
+# against the source directory, enabling static libraries and -fPIC and
+# disabling neighbour resolution, pyverbs and valgrind support.
+# cmake's stdout is redirected to $(rdma-core_config_log).
+# NOTE(review): rdma-core_build_dir / _src_dir / _config_log are not set in
+# this file; presumably provided by the 'package' template - confirm.
+define  rdma-core_config_cmds
+       cd $(rdma-core_build_dir) && \
+         cmake -G Ninja $(rdma-core_src_dir) \
+           -DENABLE_STATIC=1 -DENABLE_RESOLVE_NEIGH=0 -DNO_PYVERBS=1 -DENABLE_VALGRIND=0 \
+           -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+           -DCMAKE_C_FLAGS=-fPIC > $(rdma-core_config_log)
+endef
+
+# Build step: build only the libibverbs.a and libmlx5.a targets in the
+# build directory; stdout is redirected to $(rdma-core_build_log).
+define  rdma-core_build_cmds
+       cmake --build $(rdma-core_build_dir) -- libibverbs.a libmlx5.a > $(rdma-core_build_log)
+endef
+
+# Install step: copy the files listed in RDMA_FILES from the build
+# directory into the install directory through a tar pipe.
+#  - --xform rewrites '/statics/' to '/', so lib/statics/*.a lands in lib/
+#  - -h makes tar archive the files symlinks point to, not the links
+# The list of extracted files is written to $(rdma-core_install_log).
+define  rdma-core_install_cmds
+       mkdir -p $(rdma-core_install_dir)
+       tar -C $(rdma-core_build_dir) --xform='s|/statics/|/|' -hc $(RDMA_FILES) | tar -C $(rdma-core_install_dir) -xv > $(rdma-core_install_log)
+endef
+
+$(eval $(call package,rdma-core))
diff --git a/src/plugins/rdma/CMakeLists.txt b/src/plugins/rdma/CMakeLists.txt
new file mode 100644 (file)
index 0000000..35d43db
--- /dev/null
@@ -0,0 +1,61 @@
+# Copyright (c) 2018 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+message(STATUS "RDMA plugins - looking for ibverbs")
+
+find_path(IBVERBS_INCLUDE_DIR NAMES infiniband/verbs.h)
+find_library(IBVERBS_LIB NAMES libibverbs.a)
+find_library(MLX5_LIB NAMES libmlx5.a)
+
+# The headers and both static libraries are all mandatory.
+# IBVERBS_INCLUDE_DIR is handed to include_directories() further down, so a
+# NOTFOUND value must disable the plugin here rather than break the
+# configure step later.
+if (NOT IBVERBS_INCLUDE_DIR OR NOT IBVERBS_LIB OR NOT MLX5_LIB)
+  message(WARNING "RDMA plugins - ibverbs not found - rdma_plugin disabled")
+  return()
+endif()
+
+if (MLX5_LIB)
+  string_append(RDMA_LINK_FLAGS "-Wl,--whole-archive,${MLX5_LIB},--no-whole-archive")
+endif()
+
+set(CMAKE_REQUIRED_FLAGS "-fPIC -shared ${IBVERBS_LIB} ${RDMA_LINK_FLAGS}")
+CHECK_C_SOURCE_COMPILES("" IBVERBS_COMPILES_CHECK)
+
+if (NOT IBVERBS_COMPILES_CHECK)
+  message(WARNING "RDMA plugins - no working ibverbs found - rdma_plugin disabled")
+  return()
+endif()
+
+message(STATUS "RDMA plugins - found ${IBVERBS_INCLUDE_DIR}")
+message(STATUS "RDMA plugins - found ${IBVERBS_LIB}")
+message(STATUS "RDMA plugins - found ${MLX5_LIB}")
+
+include_directories(${IBVERBS_INCLUDE_DIR})
+
+add_vpp_plugin(rdma
+  SOURCES
+  cli.c
+  device.c
+  format.c
+  plugin.c
+  input.c
+  output.c
+
+  MULTIARCH_SOURCES
+  input.c
+  output.c
+
+  LINK_FLAGS
+  "${RDMA_LINK_FLAGS}"
+
+  LINK_LIBRARIES
+  ${IBVERBS_LIB}
+)
diff --git a/src/plugins/rdma/cli.c b/src/plugins/rdma/cli.c
new file mode 100644 (file)
index 0000000..8919603
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * CLI handler for "create interface rdma name <ifname>".
+ *
+ * Parses one line of input, fills a zeroed rdma_create_if_args_t and calls
+ * rdma_create_if(). Returns the error reported by rdma_create_if() in
+ * args.error, a parse error on unknown input, or 0 when no input line is
+ * available.
+ */
+static clib_error_t *
+rdma_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  rdma_create_if_args_t args;
+  clib_error_t *error = 0;
+
+  clib_memset (&args, 0, sizeof (rdma_create_if_args_t));
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "name %s", &args.ifname))
+       ;
+      else
+       {
+         /* report the offending token from the line being parsed, and
+          * fall through to the common cleanup instead of returning */
+         error = clib_error_return (0, "unknown input `%U'",
+                                    format_unformat_error, line_input);
+         break;
+       }
+    }
+  unformat_free (line_input);
+
+  if (error == 0)
+    {
+      rdma_create_if (vm, &args);
+      error = args.error;
+    }
+
+  /* args.ifname is a vector allocated by unformat ("%s") */
+  vec_free (args.ifname);
+
+  return error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (rdma_create_command, static) = {
+  .path = "create interface rdma",
+  .short_help = "create interface rdma <name ifname>",
+  .function = rdma_create_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "delete interface rdma {<interface> | sw_if_index <n>}".
+ *
+ * Resolves the given interface, checks that it belongs to the RDMA device
+ * class and deletes it with rdma_delete_if(). Returns 0 on success (or
+ * when no input line is available) and an error otherwise.
+ */
+static clib_error_t *
+rdma_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 sw_if_index = ~0;
+  vnet_hw_interface_t *hw;
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd;
+  vnet_main_t *vnm = vnet_get_main ();
+  clib_error_t *error = 0;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+       ;
+      else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+                        vnm, &sw_if_index))
+       ;
+      else
+       {
+         /* keep going to unformat_free() so line_input is not leaked */
+         error = clib_error_return (0, "unknown input `%U'",
+                                    format_unformat_error, line_input);
+         break;
+       }
+    }
+  unformat_free (line_input);
+
+  if (error)
+    return error;
+
+  if (sw_if_index == ~0)
+    return clib_error_return (0,
+                             "please specify interface name or sw_if_index");
+
+  /* a raw "sw_if_index <n>" comes straight from the user: validate it
+   * before using it to look up the hardware interface */
+  if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index))
+    return clib_error_return (0, "invalid sw_if_index %u", sw_if_index);
+
+  hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  if (hw == NULL || rdma_device_class.index != hw->dev_class_index)
+    return clib_error_return (0, "not an RDMA interface");
+
+  rd = pool_elt_at_index (rm->devices, hw->dev_instance);
+
+  rdma_delete_if (vm, rd);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (rdma_delete_command, static) = {
+  .path = "delete interface rdma",
+  .short_help = "delete interface rdma "
+    "{<interface> | sw_if_index <sw_idx>}",
+  .function = rdma_delete_command_fn,
+};
+/* *INDENT-ON* */
+
+clib_error_t *
+rdma_cli_init (vlib_main_t * vm)
+{
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (rdma_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c
new file mode 100644 (file)
index 0000000..31112a9
--- /dev/null
@@ -0,0 +1,607 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/if_link.h>
+#include <linux/if_ether.h>
+
+#include <vppinfra/linux/sysfs.h>
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+rdma_main_t rdma_main;
+
+#define rdma_log_debug(dev, f, ...) \
+{                                                                   \
+  vlib_log(VLIB_LOG_LEVEL_DEBUG, rdma_main.log_class, "%U: " f,      \
+          format_vlib_pci_addr, &rd->pci_addr, ##__VA_ARGS__);     \
+};
+
+static u32
+rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
+{
+  rdma_main_t *rm = &rdma_main;
+  vlib_log_warn (rm->log_class, "TODO");
+  return 0;
+}
+
+static void
+rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
+{
+  struct ibv_port_attr attr;
+  u32 width = 0;
+  u32 speed = 0;
+
+  if (ibv_query_port (rd->ctx, port, &attr))
+    {
+      vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      return;
+    }
+
+  /* update state */
+  switch (attr.state)
+    {
+    case IBV_PORT_ACTIVE:      /* fallthrough */
+    case IBV_PORT_ACTIVE_DEFER:
+      rd->flags |= RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
+                                  VNET_HW_INTERFACE_FLAG_LINK_UP);
+      break;
+    default:
+      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      break;
+    }
+
+  /* update speed */
+  switch (attr.active_width)
+    {
+    case 1:
+      width = 1;
+      break;
+    case 2:
+      width = 4;
+      break;
+    case 4:
+      width = 8;
+      break;
+    case 8:
+      width = 12;
+      break;
+    }
+  switch (attr.active_speed)
+    {
+    case 1:
+      speed = 2500000;
+      break;
+    case 2:
+      speed = 5000000;
+      break;
+    case 4:                    /* fallthrough */
+    case 8:
+      speed = 10000000;
+      break;
+    case 16:
+      speed = 14000000;
+      break;
+    case 32:
+      speed = 25000000;
+      break;
+    }
+  vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed);
+}
+
+static clib_error_t *
+rdma_async_event_error_ready (clib_file_t * f)
+{
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
+  return clib_error_return (0, "RDMA async event error for device %U",
+                           format_vlib_pci_addr, &rd->pci_addr);
+}
+
+/*
+ * Read callback for the device async event fd (see
+ * rdma_async_event_init()). f->private_data holds the device instance.
+ *
+ * Fetches one async event, updates the link state on port active/error
+ * events, marks the link down on a fatal device event, logs anything else,
+ * then acknowledges the event. Returns an error only if the event cannot
+ * be fetched.
+ */
+static clib_error_t *
+rdma_async_event_read_ready (clib_file_t * f)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  /* rm->devices is a pool: use the pool accessor like the other lookups */
+  rdma_device_t *rd = pool_elt_at_index (rm->devices, f->private_data);
+  int ret;
+  struct ibv_async_event event;
+  ret = ibv_get_async_event (rd->ctx, &event);
+  if (ret < 0)
+    {
+      return clib_error_return_unix (0, "ibv_get_async_event() failed");
+    }
+
+  switch (event.event_type)
+    {
+    case IBV_EVENT_PORT_ACTIVE:        /* fallthrough */
+    case IBV_EVENT_PORT_ERR:
+      /* both events just trigger a re-query of the port state */
+      rdma_update_state (vnm, rd, event.element.port_num);
+      break;
+    case IBV_EVENT_DEVICE_FATAL:
+      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U",
+                     format_vlib_pci_addr, &rd->pci_addr);
+      break;
+    default:
+      vlib_log_warn (rm->log_class,
+                    "Unhandled RDMA async event %i for device %U",
+                    event.event_type, format_vlib_pci_addr, &rd->pci_addr);
+      break;
+    }
+
+  /* every event obtained with ibv_get_async_event() must be acked */
+  ibv_ack_async_event (&event);
+  return 0;
+}
+
+/*
+ * Make the device async event fd non-blocking and register it with the
+ * file poller, using rdma_async_event_read_ready() and
+ * rdma_async_event_error_ready() as callbacks. The resulting file index is
+ * stored in rd->async_event_clib_file_index.
+ * Returns 0 on success, or an error if fcntl() fails.
+ */
+static clib_error_t *
+rdma_async_event_init (rdma_device_t * rd)
+{
+  clib_file_t t = { 0 };
+  int ret;
+
+  /* make RDMA async event fd non-blocking */
+  ret = fcntl (rd->ctx->async_fd, F_GETFL);
+  if (ret < 0)
+    {
+      return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
+    }
+  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
+  if (ret < 0)
+    {
+      return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");
+    }
+
+  /* register RDMA async event fd */
+  t.read_function = rdma_async_event_read_ready;
+  t.file_descriptor = rd->ctx->async_fd;
+  t.error_function = rdma_async_event_error_ready;
+  /* the callbacks look the device up again from its instance number */
+  t.private_data = rd->dev_instance;
+  t.description =
+    format (0, "RDMA %U async event", format_vlib_pci_addr, &rd->pci_addr);
+
+  rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
+
+  return 0;
+}
+
+static void
+rdma_async_event_cleanup (rdma_device_t * rd)
+{
+  clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
+}
+
+static clib_error_t *
+rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
+{
+  return ethernet_register_interface (vnm, rdma_device_class.index,
+                                     rd->dev_instance, rd->hwaddr,
+                                     &rd->hw_if_index, rdma_flag_change);
+}
+
+static void
+rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
+{
+  vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+  vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
+  ethernet_delete_interface (vnm, rd->hw_if_index);
+}
+
+static void
+rdma_dev_cleanup (rdma_device_t * rd)
+{
+  rdma_main_t *rm = &rdma_main;
+  rdma_rxq_t *rxq;
+  rdma_txq_t *txq;
+
+#define _(fn, arg) if (arg) \
+  { \
+    int rv; \
+    if ((rv = fn (arg))) \
+       rdma_log_debug (rd, #fn "() failed (rv = %d)", rv); \
+  }
+
+  _(ibv_destroy_flow, rd->flow_mcast);
+  _(ibv_destroy_flow, rd->flow_ucast);
+  _(ibv_dereg_mr, rd->mr);
+  vec_foreach (txq, rd->txqs)
+  {
+    _(ibv_destroy_qp, txq->qp);
+    _(ibv_destroy_cq, txq->cq);
+  }
+  vec_foreach (rxq, rd->rxqs)
+  {
+    _(ibv_destroy_qp, rxq->qp);
+    _(ibv_destroy_cq, rxq->cq);
+  }
+  _(ibv_dealloc_pd, rd->pd);
+  _(ibv_close_device, rd->ctx);
+#undef _
+
+  clib_error_free (rd->error);
+
+  vec_free (rd->rxqs);
+  vec_free (rd->txqs);
+  pool_put (rm->devices, rd);
+}
+
+static clib_error_t *
+rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
+{
+  rdma_rxq_t *rxq;
+  struct ibv_qp_init_attr qpia;
+  struct ibv_qp_attr qpa;
+  int qp_flags;
+
+  vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
+  rxq = vec_elt_at_index (rd->rxqs, qid);
+  rxq->size = n_desc;
+
+  if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
+    return clib_error_return_unix (0, "Create CQ Failed");
+
+  memset (&qpia, 0, sizeof (qpia));
+  qpia.qp_type = IBV_QPT_RAW_PACKET;
+  qpia.send_cq = rxq->cq;
+  qpia.recv_cq = rxq->cq;
+  qpia.cap.max_recv_wr = n_desc;
+  qpia.cap.max_recv_sge = 1;
+
+  if ((rxq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
+    return clib_error_return_unix (0, "Queue Pair create failed");
+
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
+  qpa.qp_state = IBV_QPS_INIT;
+  qpa.port_num = 1;
+  if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (init) Failed");
+
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE;
+  qpa.qp_state = IBV_QPS_RTR;
+  if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (receive) Failed");
+
+  return 0;
+}
+
+static clib_error_t *
+rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
+{
+  rdma_txq_t *txq;
+  struct ibv_qp_init_attr qpia;
+  struct ibv_qp_attr qpa;
+  int qp_flags;
+
+  vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
+  txq = vec_elt_at_index (rd->txqs, qid);
+  txq->size = n_desc;
+
+  if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
+    return clib_error_return_unix (0, "Create CQ Failed");
+
+  memset (&qpia, 0, sizeof (qpia));
+  qpia.qp_type = IBV_QPT_RAW_PACKET;
+  qpia.send_cq = txq->cq;
+  qpia.recv_cq = txq->cq;
+  qpia.cap.max_send_wr = n_desc;
+  qpia.cap.max_send_sge = 1;
+
+  if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
+    return clib_error_return_unix (0, "Queue Pair create failed");
+
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
+  qpa.qp_state = IBV_QPS_INIT;
+  qpa.port_num = 1;
+  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (init) Failed");
+
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE;
+  qpa.qp_state = IBV_QPS_RTR;
+  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (receive) Failed");
+
+  memset (&qpa, 0, sizeof (qpa));
+  qp_flags = IBV_QP_STATE;
+  qpa.qp_state = IBV_QPS_RTS;
+  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
+    return clib_error_return_unix (0, "Modify QP (send) Failed");
+  return 0;
+}
+
+static clib_error_t *
+rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd)
+{
+  clib_error_t *err;
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  u16 i;
+
+  if (rd->ctx == 0)
+    return clib_error_return_unix (0, "Device Open Failed");
+
+  if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
+    return clib_error_return_unix (0, "PD Alloc Failed");
+
+  if ((err = rdma_rxq_init (vm, rd, 0, 512)))
+    return err;
+
+  for (i = 0; i < tm->n_vlib_mains; i++)
+    if ((err = rdma_txq_init (vm, rd, i, 512)))
+      return err;
+
+  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
+                           bm->buffer_mem_size,
+                           IBV_ACCESS_LOCAL_WRITE)) == 0)
+    return clib_error_return_unix (0, "Register MR Failed");
+
+  ethernet_mac_address_generate (rd->hwaddr);
+
+  /*
+   * restrict packets steering to our MAC
+   * allows to share a single HW NIC with multiple RDMA ifaces
+   * and/or Linux
+   */
+  struct raw_eth_flow_attr
+  {
+    struct ibv_flow_attr attr;
+    struct ibv_flow_spec_eth spec_eth;
+  } __attribute__ ((packed)) fa;
+  memset (&fa, 0, sizeof (fa));
+  fa.attr.num_of_specs = 1;
+  fa.attr.port = 1;
+  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
+  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
+  memcpy (fa.spec_eth.val.dst_mac, rd->hwaddr,
+         sizeof (fa.spec_eth.val.dst_mac));
+  memset (fa.spec_eth.mask.dst_mac, 0xff, sizeof (fa.spec_eth.mask.dst_mac));
+  if ((rd->flow_ucast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
+    return clib_error_return_unix (0, "create Flow Failed");
+
+  /* receive multicast packets too */
+  memset (&fa, 0, sizeof (fa));
+  fa.attr.num_of_specs = 1;
+  fa.attr.port = 1;
+  fa.attr.flags = IBV_FLOW_ATTR_FLAGS_DONT_TRAP;       /* let others receive them too */
+  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
+  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
+  fa.spec_eth.val.dst_mac[0] = 1;
+  fa.spec_eth.mask.dst_mac[0] = 1;
+  if ((rd->flow_mcast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
+    return clib_error_return_unix (0, "create Flow Failed");
+
+  return 0;
+}
+
+/*
+ * Resolve the sysfs symlink 'path' and parse the name of its target as a
+ * PCI address into 'addr'.
+ * Returns non-zero on success, 0 if the link cannot be resolved or its
+ * name is not a PCI address.
+ */
+static uword
+sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
+{
+  uword rv;
+  unformat_input_t in;
+  u8 *s;
+
+  s = clib_sysfs_link_to_name (path);
+  /* no such link (e.g. not a PCI device): nothing to parse */
+  if (s == 0)
+    return 0;
+
+  unformat_init_string (&in, (char *) s, strlen ((char *) s));
+  rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
+  unformat_free (&in);
+  vec_free (s);
+  return rv;
+}
+
+/*
+ * Create an RDMA interface on top of the Linux netdev args->ifname.
+ *
+ * Steps: allocate a device from the pool, check that the netdev is bound
+ * to mlx5_core, find its PCI address, open the ibverbs device with the
+ * same PCI address, initialize it, register the ethernet interface and the
+ * async event handler, then assign the input node and rx thread.
+ *
+ * On success args->sw_if_index is set. On failure args->error and args->rv
+ * are set, the error is logged and everything allocated here is released.
+ */
+void
+rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd = 0;
+  struct ibv_device **dev_list = 0;
+  int n_devs = 0;
+  u8 *s = 0, *s2 = 0;
+
+  pool_get_zero (rm->devices, rd);
+  rd->dev_instance = rd - rm->devices;
+  rd->per_interface_next_index = ~0;
+
+  /* check if device exist and if it is bound to mlx5_core */
+  s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0);
+  s2 = clib_sysfs_link_to_name ((char *) s);
+
+  if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0)
+    {
+      args->error =
+       clib_error_return (0,
+                          "invalid interface (only mlx5 supported for now)");
+      goto err0;
+    }
+
+  /* extract PCI address */
+  vec_reset_length (s);
+  s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0);
+  if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0)
+    {
+      args->error = clib_error_return (0, "cannot find PCI address");
+      goto err0;
+    }
+
+  /* ibv_get_device_list() returns NULL on failure */
+  dev_list = ibv_get_device_list (&n_devs);
+  if (dev_list == 0 || n_devs == 0)
+    {
+      args->error =
+       clib_error_return_unix (0,
+                               "no RDMA devices available, errno = %d. Is the ib_uverbs module loaded?",
+                               errno);
+      goto err1;
+    }
+
+  /* open the ibverbs device sitting at the same PCI address as the netdev */
+  for (int i = 0; i < n_devs; i++)
+    {
+      vlib_pci_addr_t addr;
+
+      vec_reset_length (s);
+      s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);
+
+      if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
+       continue;
+
+      if (addr.as_u32 != rd->pci_addr.as_u32)
+       continue;
+
+      if ((rd->ctx = ibv_open_device (dev_list[i])))
+       break;
+    }
+
+  /* rdma_dev_init() fails if no device could be opened (rd->ctx == 0) */
+  if ((args->error = rdma_dev_init (vm, rd)))
+    goto err2;
+
+  if ((args->error = rdma_register_interface (vnm, rd)))
+    goto err2;
+
+  if ((args->error = rdma_async_event_init (rd)))
+    goto err3;
+
+  rdma_update_state (vnm, rd, 1);
+
+  vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index);
+  args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
+  /*
+   * FIXME: add support for interrupt mode
+   * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
+   * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+   */
+  vnet_hw_interface_set_input_node (vnm, rd->hw_if_index,
+                                   rdma_input_node.index);
+  vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, 0, ~0);
+
+  /* success: the opened context stays valid after the list is freed,
+   * release the temporaries that are no longer needed */
+  ibv_free_device_list (dev_list);
+  vec_free (s2);
+  vec_free (s);
+  return;
+
+err3:
+  rdma_unregister_interface (vnm, rd);
+err2:
+  rdma_dev_cleanup (rd);       /* also returns rd to the pool */
+  rd = 0;
+err1:
+  if (dev_list)
+    ibv_free_device_list (dev_list);
+err0:
+  /* early failures: rd was allocated but never went through cleanup */
+  if (rd)
+    pool_put (rm->devices, rd);
+  vec_free (s2);
+  vec_free (s);
+  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
+  vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
+}
+
+/*
+ * Tear down an RDMA interface: stop async event handling, remove the
+ * ethernet interface, then release the device resources.
+ */
+void
+rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd)
+{
+  vnet_main_t *vnm;
+
+  rdma_async_event_cleanup (rd);
+
+  vnm = vnet_get_main ();
+  rdma_unregister_interface (vnm, rd);
+
+  rdma_dev_cleanup (rd);
+}
+
+static clib_error_t *
+rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd = vec_elt_at_index (rm->devices, hi->dev_instance);
+  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+
+  if (rd->flags & RDMA_DEVICE_F_ERROR)
+    return clib_error_return (0, "device is in error state");
+
+  if (is_up)
+    {
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
+                                  VNET_HW_INTERFACE_FLAG_LINK_UP);
+      rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
+    }
+  else
+    {
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
+    }
+  return 0;
+}
+
+/*
+ * rx redirect callback: select the node that follows rdma-input for this
+ * interface. A node_index of ~0 turns redirection off; any other value is
+ * added as a next node of rdma_input_node and the resulting next index is
+ * stored in the device.
+ */
+static void
+rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+                             u32 node_index)
+{
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+  rdma_device_t *dev =
+    pool_elt_at_index (rdma_main.devices, hi->dev_instance);
+
+  if (node_index != ~0)
+    dev->per_interface_next_index =
+      vlib_node_add_next (vlib_get_main (), rdma_input_node.index,
+                         node_index);
+  else
+    /* Shut off redirection */
+    dev->per_interface_next_index = node_index;
+}
+
+static char *rdma_tx_func_error_strings[] = {
+#define _(n,s) s,
+  foreach_rdma_tx_func_error
+#undef _
+};
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (rdma_device_class,) =
+{
+  .name = "RDMA interface",
+  .format_device = format_rdma_device,
+  .format_device_name = format_rdma_device_name,
+  .admin_up_down_function = rdma_interface_admin_up_down,
+  .rx_redirect_to_node = rdma_set_interface_next_node,
+  .tx_function_n_errors = RDMA_TX_N_ERROR,
+  .tx_function_error_strings = rdma_tx_func_error_strings,
+};
+/* *INDENT-ON* */
+
+clib_error_t *
+rdma_init (vlib_main_t * vm)
+{
+  rdma_main_t *rm = &rdma_main;
+
+  rm->log_class = vlib_log_register_class ("rdma", 0);
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (rdma_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/format.c b/src/plugins/rdma/format.c
new file mode 100644 (file)
index 0000000..7ef65d4
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * Format the interface name of an RDMA device as "rdma-<dev_instance>".
+ * Consumes one va_arg: the u32 index of the device in rdma_main.devices.
+ */
+u8 *
+format_rdma_device_name (u8 * s, va_list * args)
+{
+  u32 dev_index = va_arg (*args, u32);
+  rdma_device_t *dev = vec_elt_at_index (rdma_main.devices, dev_index);
+
+  return format (s, "rdma-%u", dev->dev_instance);
+}
+
+/*
+ * Format the flags set on an RDMA device as a space-separated list of
+ * the display strings from foreach_rdma_device_flags.
+ * Consumes one va_arg: a rdma_device_t pointer.
+ */
+u8 *
+format_rdma_device_flags (u8 * s, va_list * args)
+{
+  rdma_device_t *rd = va_arg (*args, rdma_device_t *);
+  u8 *t = 0;
+
+  /* expand to one test per flag bit; a set bit appends its display
+     string to the temporary vector t, preceded by a space unless t is
+     still empty */
+#define _(a, b, c) if (rd->flags & (1 << a)) \
+t = format (t, "%s%s", t ? " ":"", c);
+  foreach_rdma_device_flags
+#undef _
+    /* append the temporary vector to the output, then release it */
+    s = format (s, "%v", t);
+  vec_free (t);
+  return s;
+}
+
+/*
+ * Format the details of an RDMA device: its flags and, when one is
+ * recorded on the device, the error on a following line indented to the
+ * current output indent.
+ * Consumes one va_arg: the u32 index of the device in rdma_main.devices.
+ */
+u8 *
+format_rdma_device (u8 * s, va_list * args)
+{
+  u32 dev_index = va_arg (*args, u32);
+  rdma_device_t *dev = vec_elt_at_index (rdma_main.devices, dev_index);
+  u32 indent = format_get_indent (s);
+
+  s = format (s, "flags: %U", format_rdma_device_flags, dev);
+
+  /* nothing more to show for a healthy device */
+  if (dev->error == 0)
+    return s;
+
+  return format (s, "\n%Uerror %U", format_white_space, indent,
+                format_clib_error, dev->error);
+}
+
+/*
+ * Format one rdma-input trace record: interface name, hw_if_index and
+ * the name of the next node the packet was sent to.
+ * Consumes three va_args, in order: vlib_main_t *, vlib_node_t * and
+ * rdma_input_trace_t *.
+ */
+u8 *
+format_rdma_input_trace (u8 * s, va_list * args)
+{
+  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
+  vlib_node_t *node = va_arg (*args, vlib_node_t *);
+  rdma_input_trace_t *tr = va_arg (*args, rdma_input_trace_t *);
+  vnet_hw_interface_t *hw;
+
+  hw = vnet_get_hw_interface (vnet_get_main (), tr->hw_if_index);
+
+  return format (s, "rdma: %v (%d) next-node %U",
+                hw->name, tr->hw_if_index, format_vlib_next_node_name, vm,
+                node->index, tr->next_index);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c
new file mode 100644 (file)
index 0000000..001d1c5
--- /dev/null
@@ -0,0 +1,202 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * Error counters of the rdma-input node as _(symbol, display string).
+ * The list is expanded twice below: once into the RDMA_INPUT_ERROR_*
+ * enum and once into the matching string table, which keeps both in the
+ * same order.
+ */
+#define foreach_rdma_input_error \
+  _(BUFFER_ALLOC, "buffer alloc error")
+
+/* RDMA_INPUT_ERROR_<symbol> values; RDMA_INPUT_N_ERROR is the count */
+typedef enum
+{
+#define _(f,s) RDMA_INPUT_ERROR_##f,
+  foreach_rdma_input_error
+#undef _
+    RDMA_INPUT_N_ERROR,
+} rdma_input_error_t;
+
+/* display strings, indexable by rdma_input_error_t; referenced by the
+   rdma-input node registration in this file */
+static __clib_unused char *rdma_input_error_strings[] = {
+#define _(n,s) s,
+  foreach_rdma_input_error
+#undef _
+};
+
+/*
+ * Top up an RX queue with fresh buffers.
+ *
+ * Allocates up to min (VLIB_FRAME_SIZE, free ring slots) buffers and
+ * posts each of them as a single-SGE receive work request whose wr_id is
+ * the buffer index.  A buffer whose post fails is freed again; every
+ * successful post bumps rxq->n_enq.  Does nothing when the queue already
+ * holds rxq->size buffers.
+ */
+static_always_inline void
+rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
+                         rdma_rxq_t * rxq)
+{
+  u32 bis[VLIB_FRAME_SIZE];
+  struct ibv_recv_wr wr, *bad_wr;
+  struct ibv_sge sge;
+  u32 n_wanted, n_got, k;
+
+  /* ring already full */
+  if (rxq->n_enq >= rxq->size)
+    return;
+
+  n_wanted = clib_min (VLIB_FRAME_SIZE, rxq->size - rxq->n_enq);
+  n_got = vlib_buffer_alloc (vm, bis, n_wanted);
+
+  /* the work request and the constant SGE fields are set up once; only
+     the address and wr_id change from one buffer to the next */
+  sge.length = vlib_buffer_get_default_data_size (vm);
+  sge.lkey = rd->mr->lkey;
+  wr.next = NULL;
+  wr.sg_list = &sge;
+  wr.num_sge = 1;
+
+  for (k = 0; k < n_got; k++)
+    {
+      sge.addr = vlib_buffer_get_va (vlib_get_buffer (vm, bis[k]));
+      wr.wr_id = bis[k];
+
+      if (ibv_post_recv (rxq->qp, &wr, &bad_wr) == 0)
+       {
+         rxq->n_enq++;
+         continue;
+       }
+
+      /* could not be posted: give the buffer back */
+      vlib_buffer_free (vm, bis + k, 1);
+    }
+}
+
+/*
+ * Receive packets from one RX queue of an RDMA device.
+ *
+ * Polls the queue's completion queue for up to VLIB_FRAME_SIZE
+ * completions, turns each completion back into its buffer (the buffer
+ * index travels in wr_id), enqueues the buffers to the next node, adds
+ * trace records when tracing is active, updates the RX counters and
+ * finally refills the queue.
+ *
+ * vm, node, frame  usual node function arguments
+ * rd               device to poll
+ * qid              index of the RX queue in rd->rxqs
+ *
+ * Returns the number of packets received; 0 when the completion queue
+ * is empty or polling failed.
+ */
+static_always_inline uword
+rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+                         vlib_frame_t * frame, rdma_device_t * rd, u16 qid)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
+  u32 n_trace;
+  struct ibv_wc wc[VLIB_FRAME_SIZE];
+  u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+  u32 *to_next, n_left_to_next;
+  int n_polled;
+  u32 i, n_rx_packets, n_rx_bytes = 0;
+
+  /* ibv_poll_cq () returns an int which is negative on failure: keep it
+     signed, otherwise an error turns into a huge packet count and the
+     loop below runs off the end of wc[] */
+  n_polled = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);
+
+  if (n_polled <= 0)
+    {
+      /* nothing received: just make sure the ring is topped up and do
+         not bother building an empty frame */
+      rdma_device_input_refill (vm, rd, rxq);
+      return 0;
+    }
+  n_rx_packets = n_polled;
+
+  if (PREDICT_FALSE (rd->per_interface_next_index != ~0))
+    next_index = rd->per_interface_next_index;
+
+  vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+  /* NOTE(review): wc[i].status is not inspected, so failed completions
+     are treated like received packets - confirm this is intended */
+  for (i = 0; i < n_rx_packets; i++)
+    {
+      u32 bi = wc[i].wr_id;
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+      b->current_length = wc[i].byte_len;
+      vnet_buffer (b)->sw_if_index[VLIB_RX] = rd->sw_if_index;
+      vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0;
+      to_next[i] = bi;
+      n_rx_bytes += wc[i].byte_len;
+    }
+
+  if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node))))
+    {
+      u32 n_left = n_rx_packets;
+      u32 *bi = to_next;
+
+      /* trace the first packets of the frame until either the trace
+         budget or the frame is exhausted */
+      while (n_trace && n_left)
+       {
+         vlib_buffer_t *b;
+         rdma_input_trace_t *tr;
+         b = vlib_get_buffer (vm, bi[0]);
+         vlib_trace_buffer (vm, node, next_index, b, /* follow_chain */ 0);
+         tr = vlib_add_trace (vm, node, b, sizeof (*tr));
+         tr->next_index = next_index;
+         tr->hw_if_index = rd->hw_if_index;
+
+         /* next */
+         n_trace--;
+         n_left--;
+         bi++;
+       }
+      vlib_set_trace_count (vm, node, n_trace);
+    }
+
+  if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT))
+    {
+      /* every packet of the frame comes from the same interface: tell
+         ethernet-input through the frame flags and scalar arguments */
+      vlib_next_frame_t *nf;
+      vlib_frame_t *f;
+      ethernet_input_frame_t *ef;
+      nf = vlib_node_runtime_get_next_frame (vm, node, next_index);
+      f = vlib_get_frame (vm, nf->frame_index);
+      f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
+
+      ef = vlib_frame_scalar_args (f);
+      ef->sw_if_index = rd->sw_if_index;
+      ef->hw_if_index = rd->hw_if_index;
+      //f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK;
+    }
+
+  n_left_to_next -= n_rx_packets;
+  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+
+  /* NOTE(review): the counter vector is named combined_sw_if_counters
+     but is indexed with hw_if_index here - confirm the two indices are
+     guaranteed to match for this interface */
+  vlib_increment_combined_counter
+    (vnm->interface_main.combined_sw_if_counters +
+     VNET_INTERFACE_COUNTER_RX, vm->thread_index,
+     rd->hw_if_index, n_rx_packets, n_rx_bytes);
+
+  /* the consumed receive slots are free again: repost buffers */
+  rxq->n_enq -= n_rx_packets;
+  rdma_device_input_refill (vm, rd, rxq);
+
+  return n_rx_packets;
+}
+
+/*
+ * rdma-input node function: polls every (device, queue) pair assigned
+ * to this node runtime, skipping devices that are not admin-up, and
+ * returns the total number of packets received.
+ */
+VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm,
+                               vlib_node_runtime_t * node,
+                               vlib_frame_t * frame)
+{
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  vnet_device_and_queue_t *dq;
+  u32 n_rx_total = 0;
+
+  foreach_device_and_queue (dq, rt->devices_and_queues)
+  {
+    rdma_device_t *dev =
+      vec_elt_at_index (rdma_main.devices, dq->dev_instance);
+
+    /* only admin-up devices are polled */
+    if (dev->flags & RDMA_DEVICE_F_ADMIN_UP)
+      n_rx_total +=
+       rdma_device_input_inline (vm, node, frame, dev, dq->queue_id);
+  }
+  return n_rx_total;
+}
+
+/* *INDENT-OFF* */
+/*
+ * Registration of the rdma-input node: an input node, sibling of
+ * "device-input", registered in the disabled state and exposing the
+ * RDMA_INPUT_ERROR_* counters.
+ */
+VLIB_REGISTER_NODE (rdma_input_node) = {
+  .name = "rdma-input",
+  .sibling_of = "device-input",
+  .format_trace = format_rdma_input_trace,
+  .type = VLIB_NODE_TYPE_INPUT,
+  .state = VLIB_NODE_STATE_DISABLED,
+  .n_errors = RDMA_INPUT_N_ERROR,
+  .error_strings = rdma_input_error_strings,
+};
+
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/output.c b/src/plugins/rdma/output.c
new file mode 100644 (file)
index 0000000..4107843
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vppinfra/ring.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * Post buffers for transmission on a TX queue.
+ *
+ * Each buffer is posted as one signaled IBV_WR_SEND work request with a
+ * single SGE covering the buffer's current data; the buffer index is
+ * carried in wr_id.  Posting stops at the first buffer the queue pair
+ * refuses.
+ *
+ * buffers, n_left  buffer indices to send and their count
+ * n_tx_packets     incremented by the number of buffers posted
+ * n_tx_bytes       incremented by the bytes of the buffers posted
+ *
+ * Returns the number of buffers posted.
+ */
+static_always_inline u16
+rdma_device_output_tx (vlib_main_t * vm, rdma_device_t * rd, rdma_txq_t * txq,
+                      u32 * buffers, u16 n_left, u32 * n_tx_packets,
+                      u32 * n_tx_bytes)
+{
+  u16 n_posted = 0;
+
+  while (n_posted < n_left)
+    {
+      struct ibv_send_wr wr, *bad_wr;
+      struct ibv_sge sge;
+      u32 bi = buffers[n_posted];
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+
+      sge.addr = vlib_buffer_get_current_va (b);
+      sge.length = b->current_length;
+      sge.lkey = rd->mr->lkey;
+
+      /* start from an all-zero work request, then fill in what we use */
+      memset (&wr, 0, sizeof (wr));
+      wr.wr_id = bi;
+      wr.sg_list = &sge;
+      wr.num_sge = 1;
+      wr.opcode = IBV_WR_SEND;
+      wr.send_flags = IBV_SEND_SIGNALED;
+
+      /* queue pair refused the request: stop here, the caller decides
+         what happens to the remaining buffers */
+      if (ibv_post_send (txq->qp, &wr, &bad_wr) != 0)
+       break;
+
+      *n_tx_bytes += b->current_length;
+      n_posted++;
+    }
+
+  *n_tx_packets += n_posted;
+  return n_posted;
+}
+
+/*
+ * Reclaim transmitted buffers: polls the TX completion queue for up to
+ * VLIB_FRAME_SIZE completions and frees the buffers whose indices come
+ * back in wr_id.  Does nothing when the poll returns no completion or
+ * an error.
+ */
+static_always_inline void
+rdma_device_output_free (vlib_main_t * vm, rdma_txq_t * txq)
+{
+  struct ibv_wc completions[VLIB_FRAME_SIZE];
+  u32 bis[VLIB_FRAME_SIZE];
+  int n_done, k;
+
+  n_done = ibv_poll_cq (txq->cq, VLIB_FRAME_SIZE, completions);
+
+  if (n_done > 0)
+    {
+      /* gather the buffer indices so they can be freed in one call */
+      for (k = 0; k < n_done; k++)
+       bis[k] = completions[k].wr_id;
+
+      vlib_buffer_free (vm, bis, n_done);
+    }
+}
+
+/*
+ * TX function of the rdma device class.
+ *
+ * Picks the TX queue from the calling thread's index (modulo the number
+ * of queues), then alternates between reclaiming completed buffers and
+ * posting the frame's buffers until the whole frame is posted.  When the
+ * queue stays full, posting is retried up to n_retry times; whatever is
+ * still pending after that is freed and counted as
+ * RDMA_TX_ERROR_NO_FREE_SLOTS.
+ *
+ * Returns the number of buffers handed to the device.
+ */
+VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm,
+                                            vlib_node_runtime_t * node,
+                                            vlib_frame_t * frame)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
+  rdma_device_t *rd = pool_elt_at_index (rm->devices, ord->dev_instance);
+  u32 thread_index = vm->thread_index;
+  u8 qid = thread_index;
+  rdma_txq_t *txq = vec_elt_at_index (rd->txqs, qid % vec_len (rd->txqs));
+  u32 *buffers = vlib_frame_vector_args (frame);
+  u16 n_left;
+  u16 n_retry = 5;
+  u32 n_tx_packets = 0, n_tx_bytes = 0;
+
+  /* the lock is only taken when it was initialized for this queue */
+  clib_spinlock_lock_if_init (&txq->lock);
+
+  n_left = frame->n_vectors;
+
+  while (n_left)
+    {
+      u16 n;
+      /* reclaim completed sends first to make room in the queue */
+      rdma_device_output_free (vm, txq);
+      n =
+       rdma_device_output_tx (vm, rd, txq, buffers, n_left, &n_tx_packets,
+                              &n_tx_bytes);
+      n_left -= n;
+      buffers += n;
+
+      /* give up only once the retry budget is used up; testing a bare
+         "n_retry--" here would drop on the very first partial send
+         because the counter starts out non-zero */
+      if (n_left && n_retry-- == 0)
+       {
+         vlib_buffer_free (vm, buffers, n_left);
+         vlib_error_count (vm, node->node_index,
+                           RDMA_TX_ERROR_NO_FREE_SLOTS, n_left);
+         break;
+       }
+    }
+
+  clib_spinlock_unlock_if_init (&txq->lock);
+
+  /* NOTE(review): the counter vector is named combined_sw_if_counters
+     but is indexed with hw_if_index here - confirm the two indices are
+     guaranteed to match for this interface */
+  vlib_increment_combined_counter
+    (vnm->interface_main.combined_sw_if_counters +
+     VNET_INTERFACE_COUNTER_TX, thread_index,
+     rd->hw_if_index, n_tx_packets, n_tx_bytes);
+
+  return frame->n_vectors - n_left;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/plugin.c b/src/plugins/rdma/plugin.c
new file mode 100644 (file)
index 0000000..f229b75
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/plugin/plugin.h>
+#include <vpp/app/version.h>
+
+/* *INDENT-OFF* */
+VLIB_PLUGIN_REGISTER () = {
+  .version = VPP_BUILD_VER,
+  .description = "RDMA (ibverb) Device Plugin",
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/rdma.h b/src/plugins/rdma/rdma.h
new file mode 100644 (file)
index 0000000..860ddab
--- /dev/null
@@ -0,0 +1,141 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef _RDMA_H_
+#define _RDMA_H_
+
+#include <infiniband/verbs.h>
+#include <vlib/log.h>
+
+/*
+ * Device flags as _(bit position, symbol, display string).
+ * The list is expanded into the RDMA_DEVICE_F_<symbol> bit masks of the
+ * enum below (1 << bit position); the display strings are what the
+ * device flags formatter prints for each bit that is set.
+ */
+#define foreach_rdma_device_flags \
+  _(0, INITIALIZED, "initialized") \
+  _(1, ERROR, "error") \
+  _(2, ADMIN_UP, "admin-up") \
+  _(3, VA_DMA, "vaddr-dma") \
+  _(4, LINK_UP, "link-up") \
+  _(5, SHARED_TXQ_LOCK, "shared-txq-lock") \
+  _(6, ELOG, "elog") \
+
+enum
+{
+#define _(a, b, c) RDMA_DEVICE_F_##b = (1 << a),
+  foreach_rdma_device_flags
+#undef _
+};
+
+/* Per-queue receive state. */
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  u32 size;                    /* refill stops once n_enq reaches this */
+  u32 n_enq;                   /* receive buffers currently posted */
+  struct ibv_cq *cq;           /* polled for receive completions */
+  struct ibv_qp *qp;           /* receive buffers are posted here */
+} rdma_rxq_t;
+
+/* Per-queue transmit state. */
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  u32 size;                    /* presumably the ring size - TODO confirm */
+  u32 n_enq;                   /* presumably sends in flight - TODO confirm */
+  struct ibv_cq *cq;           /* polled for send completions */
+  struct ibv_qp *qp;           /* send work requests are posted here */
+  clib_spinlock_t lock;                /* taken around TX when initialized */
+} rdma_txq_t;
+
+/* State of one RDMA interface. */
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  /* bitwise OR of RDMA_DEVICE_F_* */
+  u32 flags;
+  /* next index used by rdma-input instead of ethernet-input; ~0 means
+     no redirection */
+  u32 per_interface_next_index;
+
+  /* printed as the numeric part of the "rdma-<n>" interface name */
+  u32 dev_instance;
+  u32 sw_if_index;
+  u32 hw_if_index;
+
+  u32 async_event_clib_file_index;
+
+  /* RX and TX queues */
+  rdma_rxq_t *rxqs;
+  rdma_txq_t *txqs;
+
+  u8 hwaddr[6];
+  vlib_pci_addr_t pci_addr;
+
+  struct ibv_context *ctx;
+  struct ibv_pd *pd;
+  /* memory region whose lkey goes into every RX and TX SGE */
+  struct ibv_mr *mr;
+  struct ibv_flow *flow_ucast;
+  struct ibv_flow *flow_mcast;
+
+  /* error */
+  clib_error_t *error;
+} rdma_device_t;
+
+/* Plugin-wide state; the single instance is rdma_main. */
+typedef struct
+{
+  rdma_device_t *devices;      /* all devices, indexed by dev_instance */
+  vlib_log_class_t log_class;  /* "rdma" log class, set by rdma_init () */
+} rdma_main_t;
+
+extern rdma_main_t rdma_main;
+
+/*
+ * Argument block of rdma_create_if (): ifname is the input, the fields
+ * after the "return" marker carry the result back to the caller.
+ */
+typedef struct
+{
+  u8 *ifname;
+
+  /* return */
+  int rv;
+  u32 sw_if_index;
+  clib_error_t *error;
+} rdma_create_if_args_t;
+
+void rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args);
+void rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd);
+
+extern vlib_node_registration_t rdma_input_node;
+extern vnet_device_class_t rdma_device_class;
+
+/* format.c */
+format_function_t format_rdma_device;
+format_function_t format_rdma_device_name;
+format_function_t format_rdma_input_trace;
+
+/* Trace record added by rdma-input for each traced packet. */
+typedef struct
+{
+  u32 next_index;              /* next index the packet was enqueued to */
+  u32 hw_if_index;             /* receiving interface */
+} rdma_input_trace_t;
+
+/*
+ * TX error counters as _(symbol, display string); expanded into the
+ * RDMA_TX_ERROR_<symbol> enum below, with RDMA_TX_N_ERROR as the count.
+ */
+#define foreach_rdma_tx_func_error            \
+_(NO_FREE_SLOTS, "no free tx slots")
+
+typedef enum
+{
+#define _(f,s) RDMA_TX_ERROR_##f,
+  foreach_rdma_tx_func_error
+#undef _
+    RDMA_TX_N_ERROR,
+} rdma_tx_func_error_t;
+
+#endif /* _RDMA_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
index 0b2ebd6..2649f68 100644 (file)
@@ -347,16 +347,8 @@ tap_create_if (vlib_main_t * vm, tap_create_if_args_t * args)
     }
 
   if (!args->mac_addr_set)
-    {
-      f64 now = vlib_time_now (vm);
-      u32 rnd;
-      rnd = (u32) (now * 1e6);
-      rnd = random_u32 (&rnd);
-
-      memcpy (args->mac_addr + 2, &rnd, sizeof (rnd));
-      args->mac_addr[0] = 2;
-      args->mac_addr[1] = 0xfe;
-    }
+    ethernet_mac_address_generate (args->mac_addr);
+
   vif->rx_ring_sz = args->rx_ring_sz != 0 ? args->rx_ring_sz : 256;
   vif->tx_ring_sz = args->tx_ring_sz != 0 ? args->tx_ring_sz : 256;
   clib_memcpy (vif->mac_addr, args->mac_addr, 6);
index 87a66a2..01fb76e 100644 (file)
@@ -70,6 +70,17 @@ ethernet_mac_address_is_zero (const u8 * mac)
   return ((*((u32 *) mac) == 0) && (*((u16 *) (mac + 4)) == 0));
 }
 
+/*
+ * Fill the 6-byte buffer mac with a generated address: the fixed prefix
+ * 02:fe (the locally-administered bit is set in the first octet)
+ * followed by 4 pseudo-random bytes seeded from the CPU time counter.
+ */
+static inline void
+ethernet_mac_address_generate (u8 * mac)
+{
+  u32 seed = clib_cpu_time_now ();
+  u32 rnd = random_u32 (&seed);
+
+  mac[0] = 2;
+  mac[1] = 0xfe;
+  memcpy (mac + 2, &rnd, sizeof (rnd));
+}
+
 static inline int
 ethernet_mac_address_equal (const u8 * a, const u8 * b)
 {