From: Jianfeng Tan
Date: Mon, 3 Jun 2019 15:51:41 +0000 (+0800)
Subject: some enhancements on TLDK
X-Git-Url: https://gerrit.fd.io/r/gitweb?a=commitdiff_plain;h=refs%2Fchanges%2F48%2F19948%2F1;p=tldk.git

some enhancements on TLDK

See README.

Change-Id: I4819cfece2d3d28afe619c5b173f0a749f86e6cd
Signed-off-by: Jianfeng Tan
Signed-off-by: Jielong Zhou
Signed-off-by: Jian Zhang
Signed-off-by: Chen Zhao
---

diff --git a/Makefile b/Makefile
index 474ada6..6470cb5 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@ endif

 RTE_TARGET ?= x86_64-native-linuxapp-gcc

+DIRS-y += dpdk
 DIRS-y += lib
 DIRS-y += examples
 DIRS-y += test
@@ -32,11 +33,18 @@ MAKEFLAGS += --no-print-directory
 O ?= $(TLDK_ROOT)/${RTE_TARGET}
 BASE_OUTPUT ?= $(abspath $(O))

+DPDK_LIBS_PATH := $(TLDK_ROOT)/dpdk/install/lib
+TLDK_LIBS_PATH := $(TLDK_ROOT)/$(RTE_TARGET)/lib
+LIBS :=
+
 .PHONY: all
 all: $(DIRS-y)

 .PHONY: clean
-clean: $(DIRS-y)
+clean:
+	@make clean -C test/packetdrill
+	@rm -rf $(RTE_TARGET)
+	@rm -rf libtldk.so libtldk.a

 .PHONY: $(DIRS-y)
 $(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk
@@ -50,6 +58,32 @@ $(DIRS-y): $(RTE_SDK)/mk/rte.vars.mk
 		RTE_TARGET=$(RTE_TARGET) \
 		$(filter-out $(DIRS-y),$(MAKECMDGOALS))

+test: libtldk.a libtldk.so
+
+libtldk.so: lib
+	$(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a))
+	@gcc -shared -o libtldk.so -L$(DPDK_LIBS_PATH) -L$(TLDK_LIBS_PATH) \
+		-Wl,--whole-archive $(LIBS) -Wl,--no-whole-archive \
+		-lpthread -ldl -lnuma
+
+define repack
+@echo -- repack $1 ---
+@rm -rf tmpxyz; rm -f $1; mkdir tmpxyz; cd tmpxyz; \
+	for f in $(LIBS) ; do \
+		fn=$$(basename $$f) ; \
+		echo $$fn ; \
+		mkdir $$fn"_obj" ; \
+		cd $$fn"_obj" ; \
+		ar x $$f ; \
+		cd .. ; \
+	done; \
+ar cru ../$1 $$(find */*.o | paste -sd " " -); cd ..; rm -rf tmpxyz
+endef
+
+libtldk.a: lib
+	$(eval LIBS = $(wildcard $(DPDK_LIBS_PATH)/librte*.a $(TLDK_LIBS_PATH)/*.a))
+	$(call repack,libtldk.a)
+
 $(RTE_SDK)/mk/rte.vars.mk:
 ifeq ($(RTE_SDK),$(LOCAL_RTE_SDK))
 	@make RTE_TARGET=$(RTE_TARGET) config all -C $(TLDK_ROOT)/dpdk/
diff --git a/README b/README
index 16f96b9..6ec5e23 100644
--- a/README
+++ b/README
@@ -1,7 +1,5 @@
 1. OVERVIEW

-  TLDK project scope is as follows:
-
   1) To implement a set of libraries for L4 protocol processing (UDP, TCP
      etc.) for both IPv4 and IPv6.
@@ -36,8 +34,18 @@
   The library uses siphash logic from the below source
   https://github.com/veorq/SipHash

+2. APIs
+
+  TLDK provides three sets of APIs:
+  - TLDK native APIs, provided by libtle_l4p.
+  - POSIX APIs, provided by libtle_glue compiled with the PRELOAD macro.
+  - POSIX APIs with changed symbol names, provided by libtle_glue without the PRELOAD macro.
+
-2. INSTALLATION GUIDE
+3. INSTALLATION GUIDE
+
+  - Original guide
+  ----------------

   1) Obtain latest supported DPDK version and build it.
      (refer to http://dpdk.org for information how to download and build it).
@@ -58,6 +66,30 @@
    make all
    ./x86_64-native-linuxapp-gcc/app/l4fwd ...
+
+  - For preload use
+  -----------------
+
+  Debug:
+
+    $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC -DPRELOAD" all
+
+  Release:
+
+    $ make EXTRA_CFLAGS="-g -fPIC -DPRELOAD" all
+
+
+  - For TLDK API use
+  ------------------
+
+  Debug:
+
+    $ make DPDK_DEBUG=y EXTRA_CFLAGS="-g -O0 -fPIC" all
+
+  Release:
+
+    $ make EXTRA_CFLAGS="-g -O3 -fPIC" all
+
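
  As an illustration (not part of this patch), the preload build is
  typically consumed by pointing LD_PRELOAD at the resulting libtldk.so,
  so that an unmodified sockets application runs over TLDK. A minimal
  sketch, assuming the DPDK_IP environment variable described in
  section 6 below; the application name is a placeholder:

    $ make EXTRA_CFLAGS="-g -fPIC -DPRELOAD" all
    $ export DPDK_IP=1.1.1.1
    $ LD_PRELOAD=./libtldk.so <your sockets application>

 3. CONTENTS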

    $(TLDK_ROOT)
@@ -74,6 +106,8 @@
     |   +--libtle_l4p - implementation of the TCP/UDP packet processing
     |   |
     |   +--libtle_timer - implementation of the timer library
+    |   |
+    |   +--libtle_glue - socket glue layer with arp, icmp, epoll, etc.
     |
     +----examples
     |   |
@@ -88,3 +122,145 @@
     |   |        (googletest)
     |   |
     |   +--timer - UT for libtle_timer (standalone app)
+    |   |
+    |   +--packetdrill - UT for stack (standalone app)
+
+
+4. Features
+
+  Done:
+  - POSIX interface
+  - loopback device
+  - regression test
+  - multi-thread
+  - lightweight mem
+  - tcp_info (partial)
+  - fd management
+  - arp request/reply
+  - icmp reply
+  - interrupt mode
+  - blocking recv/send
+  - TSO
+  - UFO
+
+  TODO:
+  - fuzzing
+  - SACK
+  - RACK
+  - zerocopy APIs
+  - batching APIs
+  - multi-process
+  - numa awareness
+  - context recycle on thread exit
+
+5. Thread model
+
+  - Multi-process is still not fully supported.
+
+  - Symmetric multi-thread
+
+     (io thread)    (io thread)    (io thread)
+          \              \              \
+          /              /              /
+          \              \              \
+   ----------------------------------------------------------
+   |                      FD management                      |
+   ----------------------------------------------------------
+
+     ------------    ------------    ------------
+     |          |    |          |    |          |
+     | glue_ctx |    | glue_ctx |    | glue_ctx |
+     |          |    |          |    |          |
+     ------------    ------------    ------------
+          \__             |             __/
+             \__          |          __/
+                \__       |       __/
+                   \__    |    __/
+                 -------------------------
+                 |   (RSS) NIC (FDIR)    |
+                 -------------------------
+
+  - Lookaside multi-thread
+
+     (app thread)   (app thread)   (io thread)
+          \              \              \
+          /              /              /
+          \              \              \
+   ----------------------------------------------------------
+   |                      FD management                      |
+   ----------------------------------------------------------
+
+   ----------------------------------------------------------
+   |                                                        |
+   |                        glue_ctx                        |
+   |                                                        |
+   ----------------------------------------------------------
+                             |
+                             |
+                 -------------------------
+                 |          NIC          |
+                 -------------------------
+
+6. How to run
+
+  We support two setups, each of which needs its own preparation:
+
+  - virtio-user: test with virtio-user + vhost-kernel;
+  - physical NIC: test with a physical NIC bound to vfio.
+
+  If you are using a physical NIC:
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  a. Set Linux boot options
+     a1. Add the configuration below to GRUB_CMDLINE_LINUX in /etc/default/grub
+         "intel_iommu=on iommu=pt"
+
+     a2. Update grub
+         $ sudo grub2-mkconfig -o /boot/grub2/grub.cfg
+
+     If you want to use 1GB hugepages, you can also add the content below
+     to the boot cmdline:
+         "default_hugepagesz=1G hugepagesz=1G hugepages=2"
+
+  b. Adjust RLIMIT_MEMLOCK
+     Add the two lines below to /etc/security/limits.conf
+         "* soft memlock 4194304
+          * hard memlock 4194304"
+
+  c. Reboot the system
+
+  d. Bind the NIC to vfio-pci
+
+     $ sudo modprobe vfio-pci
+     $ sudo ./usertools/dpdk-devbind.py -b vfio-pci 0000:01:00.1
+     $ sudo chmod 666 /dev/vfio/16    (16 is just an example)
+
+  If you are using virtio-user:
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  a. Prepare vhost-kernel
+
+     $ sudo modprobe vhost-net
+       (if the module is not available, you may have to build it yourself)
+     $ sudo chmod 666 /dev/vhost-net
+     $ sudo tunctl -u
+
+  b. Prepare the vNIC
+
+     $ export DPDK_VNIC="--vdev=virtio_user0,path=/dev/vhost-net,queue_size=1024,iface=tap0"
+
+  For both cases, we need to:
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  $ sudo chmod 777 /dev/hugepages
+  $ export DPDK_IP=1.1.1.1
+
+  Note: for specific test examples, you can refer to the example commit logs.
+
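  As an illustration (not part of this patch), a complete virtio-user run
  of the epoll echo server from examples/server could look like the
  following; it assumes a PRELOAD build (section 3), and the address,
  port and binary path are placeholders (the path follows the l4fwd
  convention from the installation guide):

    $ sudo modprobe vhost-net && sudo chmod 666 /dev/vhost-net
    $ sudo chmod 777 /dev/hugepages
    $ export DPDK_VNIC="--vdev=virtio_user0,path=/dev/vhost-net,queue_size=1024,iface=tap0"
    $ export DPDK_IP=1.1.1.1
    $ LD_PRELOAD=./libtldk.so ./x86_64-native-linuxapp-gcc/app/server 1.1.1.1 8080

+7.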
Tested Examples + + - examples/client + - examples/server + - wget (epoll) + - curl (poll) + - haproxy (multi-thread mode) diff --git a/dpdk/Makefile b/dpdk/Makefile index 63ddd6c..78bf7e2 100644 --- a/dpdk/Makefile +++ b/dpdk/Makefile @@ -14,13 +14,15 @@ # Scripts require non-POSIX parts of bash SHELL := /bin/bash -DPDK_VERSION ?= v17.11 +DPDK_VERSION ?= v18.11 DPDK_BUILD_DIR ?= $(CURDIR)/_build DPDK_INSTALL_DIR ?= $(DPDK_BUILD_DIR)/dpdk/$(RTE_TARGET) DPDK_PKTMBUF_HEADROOM ?= 128 DPDK_MARCH ?= native DPDK_TUNE ?= generic DPDK_DEBUG ?= n +DPDK_DESTDIR ?= $(CURDIR)/install +PACKETDRILL ?= n B := $(DPDK_BUILD_DIR) I := $(DPDK_INSTALL_DIR) @@ -78,6 +80,7 @@ DPDK_MAKE_ARGS := -C $(DPDK_SOURCE) -j $(JOBS) \ EXTRA_LDFLAGS="$(DPDK_EXTRA_LDFLAGS)" \ CPU_CFLAGS="$(DPDK_CPU_CFLAGS)" \ CPU_LDFLAGS="$(DPDK_CPU_LDFLAGS)" \ + DESTDIR="$(DPDK_DESTDIR)" \ $(DPDK_MAKE_EXTRA_ARGS) DPDK_SOURCE_FILES := $(shell [ -e $(DPDK_SOURCE) ] && \ @@ -99,13 +102,13 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(DPDK_SOURCE)/config/defconfig_$(DPDK_TARGET) $@ $(call set,RTE_MACHINE,$(DPDK_MACHINE)) @# modify options - $(call set,RTE_MAX_LCORE,256) + $(call set,RTE_MAX_LCORE,16) $(call set,RTE_PKTMBUF_HEADROOM,$(DPDK_PKTMBUF_HEADROOM)) $(call set,RTE_LIBEAL_USE_HPET,y) - $(call set,RTE_BUILD_COMBINE_LIBS,y) + $(call set,RTE_BUILD_COMBINE_LIBS,n) $(call set,RTE_LIBRTE_I40E_16BYTE_RX_DESC,y) $(call set,RTE_LIBRTE_I40E_ITR_INTERVAL,16) - $(call set,RTE_LIBRTE_PMD_PCAP,y) + $(call set,RTE_LIBRTE_PMD_PCAP,n) @# enable debug init for device drivers $(call set,RTE_LIBRTE_I40E_DEBUG_INIT,$(DPDK_DEBUG)) $(call set,RTE_LIBRTE_IXGBE_DEBUG_INIT,$(DPDK_DEBUG)) @@ -115,7 +118,7 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(call set,RTE_LIBRTE_PMD_BOND,y) $(call set,RTE_LIBRTE_IP_FRAG,y) @# not needed - $(call set,RTE_LIBRTE_TIMER,n) + $(call set,RTE_LIBRTE_TIMER,y) $(call set,RTE_LIBRTE_CFGFILE,n) $(call set,RTE_LIBRTE_LPM,y) $(call set,RTE_LIBRTE_ACL,n) @@ -128,6 +131,88 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(call set,RTE_LIBRTE_FLOW_CLASSIFY,n) $(call set,RTE_LIBRTE_PMD_CRYPTO_SCHEDULER,n) $(call set,RTE_KNI_KMOD,n) + $(call set,RTE_LIBRTE_ENA_PMD,n) + $(call set,RTE_LIBRTE_FM10K_PMD,n) + $(call set,RTE_LIBRTE_CXGBE_PMD,n) + $(call set,RTE_LIBRTE_ENIC_PMD,n) + $(call set,RTE_LIBRTE_BNXT_PMD,n) + $(call set,RTE_LIBRTE_SFC_EFX_PMD,n) + $(call set,RTE_LIBRTE_PMD_SOFTNIC,n) + $(call set,RTE_LIBRTE_THUNDERX_NICVF_PMD,n) + $(call set,RTE_LIBRTE_LIO_PMD,n) + $(call set,RTE_LIBRTE_OCTEONTX_PMD,n) + $(call set,RTE_LIBRTE_VMXNET3_PMD,n) + $(call set,RTE_LIBRTE_QEDE_PMD,n) + $(call set,RTE_LIBRTE_ARK_PMD,n) + $(call set,RTE_LIBRTE_PMD_NULL,n) + $(call set,RTE_LIBRTE_CRYPTODEV,n) + $(call set,RTE_LIBRTE_PMD_NULL_CRYPTO,n) + $(call set,RTE_LIBRTE_SECURITY,n) + $(call set,RTE_LIBRTE_EVENTDEV,n) + $(call set,RTE_LIBRTE_PMD_SKELETON_EVENTDEV,n) + $(call set,RTE_LIBRTE_PMD_OCTEONTX_SSOVF,n) + $(call set,RTE_LIBRTE_OCTEONTX_MEMPOOL,n) + $(call set,RTE_LIBRTE_EFD,n) + $(call set,RTE_LIBRTE_MEMBER,n) + $(call set,RTE_LIBRTE_JOBSTATS,n) + $(call set,RTE_LIBRTE_METER,n) + $(call set,RTE_LIBRTE_SCHED,n) + $(call set,RTE_APP_TEST,n) + $(call set,RTE_APP_CRYPTO_PERF,n) + $(call set,RTE_APP_EVENTDEV,n) + $(call set,RTE_LIBRTE_PMD_FAILSAFE,n) + $(call set,RTE_LIBRTE_EM_PMD,n) + $(call set,RTE_LIBRTE_IGB_PMD,n) + $(call set,RTE_LIBRTE_LATENCY_STATS,n) + $(call set,RTE_EAL_IGB_UIO,n) + $(call set,RTE_LIBRTE_KNI,n) + $(call set,RTE_LIBRTE_PMD_KNI,n) + $(call set,RTE_KNI_KMOD,n) + $(call set,RTE_KNI_KMOD_ETHTOOL,n) + $(call 
set,RTE_LIBRTE_BITRATE,n) + $(call set,RTE_LIBRTE_METRICS,y) + $(call set,RTE_LIBRTE_AVP_PMD,n) + $(call set,RTE_LIBRTE_NFP_PMD,n) + $(call set,RTE_LIBRTE_PMD_TAP,n) + $(call set,RTE_LIBRTE_VHOST,$(PACKETDRILL)) + $(call set,RTE_LIBRTE_IFC_PMD,n) + $(call set,RTE_LIBRTE_PMD_VHOST,n) + $(call set,RTE_PROC_INFO,n) + $(call set,RTE_TEST_PMD,n) + $(call set,RTE_LIBRTE_FSLMC_BUS,n) + $(call set,RTE_LIBRTE_DPAA_BUS,n) + $(call set,RTE_LIBRTE_VMBUS,n) + $(call set,RTE_LIBRTE_IFPGA_BUS,n) + $(call set,RTE_LIBRTE_BPF,n) + $(call set,RTE_LIBRTE_COMPRESSDEV,n) + $(call set,RTE_LIBRTE_VDEV_NETVSC_PMD,n) + $(call set,RTE_LIBRTE_NETVSC_PMD,n) + $(call set,RTE_LIBRTE_RAWDEV,n) + $(call set,RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT,n) + $(call set,RTE_LIBRTE_AXGBE_PMD,n) + $(call set,RTE_LIBRTE_AVF_PMD,n) + $(call set,RTE_LIBRTE_BBDEV,n) + $(call set,RTE_LIBRTE_IP_FRAG_MAX_FRAG,48) + $(call set,RTE_MAX_NUMA_NODES,2) + $(call set,RTE_MAX_ETHPORTS,4) + $(call set,RTE_MAX_QUEUES_PER_PORT,8) + $(call set,RTE_LIBRTE_I40E_PMD,n) + $(call set,RTE_LIBRTE_IXGBE_PMD,n) + $(call set,RTE_LIBRTE_ENETC_PMD,n) + $(call set,RTE_LIBRTE_PMD_BOND,n) + $(call set,RTE_LIBRTE_ATLANTIC_PMD,n) + $(call set,RTE_LIBRTE_LPM,n) + $(call set,RTE_LIBRTE_GSO,n) + $(call set,RTE_MAX_VFIO_GROUPS,4) + $(call set,RTE_MAX_VFIO_CONTAINERS,4) + $(call set,RTE_LIBRTE_COMMON_DPAAX,n) + $(call set,RTE_LIBRTE_PMD_OCTEONTX_CRYPTO,n) + $(call set,RTE_EAL_NUMA_AWARE_HUGEPAGES,n) + $(call set,RTE_DRIVER_MEMPOOL_STACK,y) + $(call set,RTE_DRIVER_MEMPOOL_BUCKET,n) + $(call set,RTE_LIBRTE_PMD_QAT,n) + $(call set,RTE_LIBRTE_PMD_AF_PACKET,n) + $(call set,RTE_MAX_MEM_MB,2048) @rm -f .config.ok $(B)/.download.ok: @@ -161,9 +246,7 @@ $(B)/.config.ok: $(B)/.patch.ok $(B)/custom-config .PHONY: config config: $(B)/.config.ok -$(B)/.build.ok: $(DPDK_SOURCE_FILES) - @if [ ! -e $(B)/.config.ok ] ; then echo 'Please run "make config" \ - first' && false ; fi +$(B)/.build.ok: $(DPDK_SOURCE_FILES) $(B)/.config.ok @make $(DPDK_MAKE_ARGS) install @cp $(I)/.config $(B)/.config @touch $@ @@ -173,4 +256,4 @@ build: $(B)/.build.ok .PHONY: clean clean: - @rm -rf $(B) $(I) + @rm -rf $(DPDK_BUILD_DIR) $(DPDK_DESTDIR) diff --git a/dpdk/dpdk-v18.11_patches/0001-mempool-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0001-mempool-prioritize-constructor.patch new file mode 100644 index 0000000..c941443 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0001-mempool-prioritize-constructor.patch @@ -0,0 +1,30 @@ +From cd36895a4a7bfc342915b42e3856bd233452f0bd Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Fri, 13 Jul 2018 15:25:22 +0800 +Subject: [PATCH 1/9] mempool: prioritize constructor + +--- + lib/librte_mempool/rte_mempool.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h +index 7c9cd9a2f..bdc32d583 100644 +--- a/lib/librte_mempool/rte_mempool.h ++++ b/lib/librte_mempool/rte_mempool.h +@@ -833,10 +833,10 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops); + * more than RTE_MEMPOOL_MAX_OPS_IDX is registered. 
+ */ + #define MEMPOOL_REGISTER_OPS(ops) \ +- void mp_hdlr_init_##ops(void); \ +- void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\ ++ static void __attribute__((constructor(101), used)) \ ++ mp_hdlr_init_##ops(void) \ + { \ +- rte_mempool_register_ops(&ops); \ ++ rte_mempool_register_ops(&ops); \ + } + + /** +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch new file mode 100644 index 0000000..9d2959f --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0002-eal-prioritize-constructor.patch @@ -0,0 +1,25 @@ +From 7fe32567994a8ce782fa8406613bade1d2100dca Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:14:09 +0000 +Subject: [PATCH 2/9] eal: prioritize constructor + +--- + lib/librte_eal/common/include/rte_common.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h +index 069c13ec7..a635f5be4 100644 +--- a/lib/librte_eal/common/include/rte_common.h ++++ b/lib/librte_eal/common/include/rte_common.h +@@ -84,7 +84,7 @@ typedef uint16_t unaligned_uint16_t; + #define RTE_PRIORITY_LOG 101 + #define RTE_PRIORITY_BUS 110 + #define RTE_PRIORITY_CLASS 120 +-#define RTE_PRIORITY_LAST 65535 ++#define RTE_PRIORITY_LAST 130 + + #define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch new file mode 100644 index 0000000..7430d1e --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0003-mbuf-add-single-linked-list.patch @@ -0,0 +1,33 @@ +From 1416ff5de58922dc32eb2fb9ce2b9b970282136c Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:18:13 +0000 +Subject: [PATCH 3/9] mbuf: add single linked list + +--- + lib/librte_mbuf/rte_mbuf.h | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h +index 9ce5d76d7..0081bd6d7 100644 +--- a/lib/librte_mbuf/rte_mbuf.h ++++ b/lib/librte_mbuf/rte_mbuf.h +@@ -593,6 +593,8 @@ struct rte_mbuf { + */ + struct rte_mbuf_ext_shared_info *shinfo; + ++ struct rte_mbuf *next_pkt; ++ + } __rte_cache_aligned; + + /** +@@ -1237,6 +1239,7 @@ static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m) + static inline void rte_pktmbuf_reset(struct rte_mbuf *m) + { + m->next = NULL; ++ m->next_pkt = NULL; + m->pkt_len = 0; + m->tx_offload = 0; + m->vlan_tci = 0; +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch new file mode 100644 index 0000000..e4eb8e7 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0004-net-virtio-user-add-rss-update-for-virtio-user.patch @@ -0,0 +1,43 @@ +From 9bbe20eda858fd7fcbd8f137e5f96f51d571a556 Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:20:51 +0000 +Subject: [PATCH 4/9] net/virtio-user: add rss update for virtio-user + +--- + drivers/net/virtio/virtio_ethdev.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c +index 614357da7..e7336cde9 100644 +--- a/drivers/net/virtio/virtio_ethdev.c ++++ b/drivers/net/virtio/virtio_ethdev.c +@@ -738,6 +738,18 @@ virtio_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id) + return 0; + } + 
++static int ++virtio_rss_hash_update(struct rte_eth_dev *dev, ++ struct rte_eth_rss_conf *rss_conf __rte_unused) ++{ ++ struct virtio_hw *hw = dev->data->dev_private; ++ ++ if (hw->virtio_user_dev) ++ return 0; ++ ++ return -1; ++} ++ + /* + * dev_ops for virtio, bare necessities for basic operation + */ +@@ -772,6 +784,7 @@ static const struct eth_dev_ops virtio_eth_dev_ops = { + .mac_addr_add = virtio_mac_addr_add, + .mac_addr_remove = virtio_mac_addr_remove, + .mac_addr_set = virtio_mac_addr_set, ++ .rss_hash_update = virtio_rss_hash_update, + }; + + static void +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0005-mbuf-add-offset-for-partly-consumed-data.patch b/dpdk/dpdk-v18.11_patches/0005-mbuf-add-offset-for-partly-consumed-data.patch new file mode 100644 index 0000000..6b21f26 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0005-mbuf-add-offset-for-partly-consumed-data.patch @@ -0,0 +1,35 @@ +From 63dccf6d146552e2f46f27bae3cef07f4895b6aa Mon Sep 17 00:00:00 2001 +From: Jielong Zhou +Date: Wed, 17 Apr 2019 10:57:51 +0800 +Subject: [PATCH] mbuf: add offset for partly consumed data + +Signed-off-by: Jielong Zhou +--- + lib/librte_mbuf/rte_mbuf.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h +index b1972c6e7..6d950ab51 100644 +--- a/lib/librte_mbuf/rte_mbuf.h ++++ b/lib/librte_mbuf/rte_mbuf.h +@@ -658,6 +658,8 @@ struct rte_mbuf { + struct rte_mbuf_ext_shared_info *shinfo; + + struct rte_mbuf *next_pkt; ++ uint32_t una_offset; ++ uint16_t next_offset; + + } __rte_cache_aligned; + +@@ -1301,6 +1303,8 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf *m) + { + m->next = NULL; + m->next_pkt = NULL; ++ m->una_offset = 0; ++ m->next_offset = 0; + m->pkt_len = 0; + m->tx_offload = 0; + m->vlan_tci = 0; +-- +2.19.0 + diff --git a/dpdk/dpdk-v18.11_patches/0007-eal-don-t-start-the-interrupt-mp-thread.patch b/dpdk/dpdk-v18.11_patches/0007-eal-don-t-start-the-interrupt-mp-thread.patch new file mode 100644 index 0000000..770bf05 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0007-eal-don-t-start-the-interrupt-mp-thread.patch @@ -0,0 +1,35 @@ +From f68558b0ccbddb4cc81aca36befa0a7730ee051c Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 29 Aug 2018 14:24:01 +0000 +Subject: [PATCH 7/9] eal: don't start the interrupt mp thread + +--- + lib/librte_eal/common/eal_common_proc.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c +index 9fcb91219..79d9e6bbe 100644 +--- a/lib/librte_eal/common/eal_common_proc.c ++++ b/lib/librte_eal/common/eal_common_proc.c +@@ -615,6 +615,7 @@ rte_mp_channel_init(void) + return -1; + } + ++#if 0 + if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle", + NULL, mp_handle, NULL) < 0) { + RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", +@@ -624,6 +625,10 @@ rte_mp_channel_init(void) + mp_fd = -1; + return -1; + } ++#else ++ RTE_SET_USED(mp_handle); ++ RTE_SET_USED(mp_handle_tid); ++#endif + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0008-mempool-add-dynamic-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0008-mempool-add-dynamic-mempool-support.patch new file mode 100644 index 0000000..9a109b2 --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0008-mempool-add-dynamic-mempool-support.patch @@ -0,0 +1,247 @@ +From 9ea6201ad3f9323a076ce1ba553836329d649def Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 26 Dec 2018 14:39:24 +0000 
+Subject: mempool: add dynamic mempool support + +Signed-off-by: Jianfeng Tan +--- + drivers/mempool/ring/rte_mempool_ring.c | 26 +++++++---- + lib/librte_mempool/rte_mempool.c | 27 +++++++++-- + lib/librte_mempool/rte_mempool.h | 62 ++++++++++++++++++++----- + 3 files changed, 92 insertions(+), 23 deletions(-) + +diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c +index bc123fc52..e8fec9119 100644 +--- a/drivers/mempool/ring/rte_mempool_ring.c ++++ b/drivers/mempool/ring/rte_mempool_ring.c +@@ -49,30 +49,40 @@ common_ring_get_count(const struct rte_mempool *mp) + static int + common_ring_alloc(struct rte_mempool *mp) + { ++ int n; + int rg_flags = 0, ret; + char rg_name[RTE_RING_NAMESIZE]; + struct rte_ring *r; + +- ret = snprintf(rg_name, sizeof(rg_name), +- RTE_MEMPOOL_MZ_FORMAT, mp->name); +- if (ret < 0 || ret >= (int)sizeof(rg_name)) { +- rte_errno = ENAMETOOLONG; +- return -rte_errno; +- } +- + /* ring flags */ + if (mp->flags & MEMPOOL_F_SP_PUT) + rg_flags |= RING_F_SP_ENQ; + if (mp->flags & MEMPOOL_F_SC_GET) + rg_flags |= RING_F_SC_DEQ; + ++ if (mp->flags & MEMPOOL_F_DYNAMIC) { ++ n = RTE_MIN(mp->size, mp->populated_size + mp->dynamic_size); ++ ++ ret = snprintf(rg_name, sizeof(rg_name), ++ RTE_MEMPOOL_MZ_FORMAT"_%x", mp->name, n); ++ } else { ++ n = mp->size; ++ ret = snprintf(rg_name, sizeof(rg_name), ++ RTE_MEMPOOL_MZ_FORMAT, mp->name); ++ } ++ ++ if (ret < 0 || ret >= (int)sizeof(rg_name)) { ++ rte_errno = ENAMETOOLONG; ++ return -rte_errno; ++ } ++ + /* + * Allocate the ring that will be used to store objects. + * Ring functions will return appropriate errors if we are + * running as a secondary process etc., so no checks made + * in this function for that condition. + */ +- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1), ++ r = rte_ring_create(rg_name, rte_align32pow2(n + 1), + mp->socket_id, rg_flags); + if (r == NULL) + return -rte_errno; +diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c +index 683b216f9..70039f6c3 100644 +--- a/lib/librte_mempool/rte_mempool.c ++++ b/lib/librte_mempool/rte_mempool.c +@@ -152,6 +152,8 @@ mempool_add_elem(struct rte_mempool *mp, __rte_unused void *opaque, + hdr->mp = mp; + hdr->iova = iova; + STAILQ_INSERT_TAIL(&mp->elt_list, hdr, next); ++ if (mp->flags & MEMPOOL_F_DYNAMIC && mp->dyn_obj_cb) ++ mp->dyn_obj_cb(mp, NULL, obj, mp->populated_size); + mp->populated_size++; + + #ifdef RTE_LIBRTE_MEMPOOL_DEBUG +@@ -426,9 +428,10 @@ rte_mempool_populate_default(struct rte_mempool *mp) + ssize_t mem_size; + size_t align, pg_sz, pg_shift; + rte_iova_t iova; +- unsigned mz_id, n; ++ unsigned mz_id, n, avail; + int ret; + bool no_contig, try_contig, no_pageshift, external; ++ bool dynamic = (mp->flags & MEMPOOL_F_DYNAMIC) ? 
true : false; + + ret = mempool_ops_alloc_once(mp); + if (ret != 0) +@@ -441,7 +444,7 @@ rte_mempool_populate_default(struct rte_mempool *mp) + external = ret; + + /* mempool must not be populated */ +- if (mp->nb_mem_chunks != 0) ++ if (mp->nb_mem_chunks != 0 && !dynamic) + return -EEXIST; + + no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG; +@@ -512,7 +515,16 @@ rte_mempool_populate_default(struct rte_mempool *mp) + pg_shift = rte_bsf32(pg_sz); + } + +- for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) { ++ n = mp->size; ++ if (dynamic) { ++ n = RTE_MIN(mp->size - mp->populated_size, mp->dynamic_size); ++ if (mp->nb_mem_chunks != 0 && rte_mempool_ops_alloc(mp) != 0) ++ return -ENOMEM; ++ } ++ ++ avail = 0; ++ mz_id = mp->nb_mem_chunks; ++ for (; n > 0; mz_id++, n -= ret, avail += ret) { + size_t min_chunk_size; + unsigned int flags; + +@@ -607,9 +619,16 @@ rte_mempool_populate_default(struct rte_mempool *mp) + } + } + +- return mp->size; ++ return avail; + + fail: ++ if (dynamic) { ++ if (avail) ++ return avail; ++ ++ return ret; ++ } ++ + rte_mempool_free_memchunks(mp); + return ret; + } +diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h +index 7e1c9c1f7..c32d35822 100644 +--- a/lib/librte_mempool/rte_mempool.h ++++ b/lib/librte_mempool/rte_mempool.h +@@ -207,6 +207,16 @@ struct rte_mempool_info { + unsigned int contig_block_size; + } __rte_cache_aligned; + ++struct rte_mempool; ++/** ++ * An object callback function for mempool. ++ * ++ * Used by rte_mempool_create() and rte_mempool_obj_iter(). ++ */ ++typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp, ++ void *opaque, void *obj, unsigned obj_idx); ++typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */ ++ + /** + * The RTE mempool structure. + */ +@@ -247,6 +257,8 @@ struct rte_mempool { + struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */ + + uint32_t populated_size; /**< Number of populated objects. */ ++ uint32_t dynamic_size; /**< Number of dynamic populated objects. */ ++ rte_mempool_obj_cb_t *dyn_obj_cb; /**< elem cb for dynamic populated objects. */ + struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */ + uint32_t nb_mem_chunks; /**< Number of memory chunks */ + struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */ +@@ -264,6 +276,8 @@ struct rte_mempool { + #define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */ + #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */ + #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */ ++#define MEMPOOL_F_DYNAMIC 0x0040 /**< Don't populate element once for all */ ++#define MEMPOOL_F_DYNAMIC_NOW 0x0080 /**< It's is dynamically populated now */ + + /** + * @internal When debug is enabled, store some statistics. +@@ -839,15 +853,6 @@ int rte_mempool_register_ops(const struct rte_mempool_ops *ops); + rte_mempool_register_ops(&ops); \ + } + +-/** +- * An object callback function for mempool. +- * +- * Used by rte_mempool_create() and rte_mempool_obj_iter(). +- */ +-typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp, +- void *opaque, void *obj, unsigned obj_idx); +-typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */ +- + /** + * A memory callback function for mempool. 
+ * +@@ -989,6 +994,22 @@ struct rte_mempool * + rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + int socket_id, unsigned flags); ++ ++static inline void ++rte_mempool_set_dynamic_size(struct rte_mempool *mp, int dynamic_size) ++{ ++ mp->flags |= MEMPOOL_F_DYNAMIC; ++ mp->dynamic_size = dynamic_size; ++} ++ ++static inline void ++rte_mempool_set_dynamic_cb(struct rte_mempool *mp, ++ rte_mempool_obj_cb_t *dyn_obj_cb) ++{ ++ mp->flags |= MEMPOOL_F_DYNAMIC; ++ mp->dyn_obj_cb = dyn_obj_cb; ++} ++ + /** + * Free a mempool + * +@@ -1390,9 +1411,28 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table, + /* get remaining objects from ring */ + ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n); + +- if (ret < 0) ++ if (ret < 0) { ++ if (mp->flags & MEMPOOL_F_DYNAMIC && ++ mp->populated_size < mp->size) { ++ int work; ++ ++ work = rte_atomic32_cmpset(&mp->flags, ++ mp->flags & ~MEMPOOL_F_DYNAMIC_NOW, ++ mp->flags | MEMPOOL_F_DYNAMIC_NOW); ++ if (work) { ++ int more; ++ ++ more = rte_mempool_populate_default(mp); ++ mp->flags &= ~MEMPOOL_F_DYNAMIC_NOW; ++ if (more > 0) ++ goto ring_dequeue; ++ } else { ++ /* mempool is populating, try again */ ++ goto ring_dequeue; ++ } ++ } + __MEMPOOL_STAT_ADD(mp, get_fail, n); +- else ++ } else + __MEMPOOL_STAT_ADD(mp, get_success, n); + + return ret; +-- +2.17.1 + diff --git a/dpdk/dpdk-v18.11_patches/0009-mbuf-add-dynamic-mbuf-mempool-support.patch b/dpdk/dpdk-v18.11_patches/0009-mbuf-add-dynamic-mbuf-mempool-support.patch new file mode 100644 index 0000000..ae1f21b --- /dev/null +++ b/dpdk/dpdk-v18.11_patches/0009-mbuf-add-dynamic-mbuf-mempool-support.patch @@ -0,0 +1,109 @@ +From 2eca8e47b535ef62430e073c6103352cfbb5c697 Mon Sep 17 00:00:00 2001 +From: Jianfeng Tan +Date: Wed, 26 Dec 2018 14:40:07 +0000 +Subject: mbuf: add dynamic mbuf mempool support + +Signed-off-by: Jianfeng Tan +--- + lib/librte_mbuf/rte_mbuf.c | 51 ++++++++++++++++++++++++++++ + lib/librte_mbuf/rte_mbuf.h | 5 +++ + lib/librte_mbuf/rte_mbuf_version.map | 8 ++++- + 3 files changed, 63 insertions(+), 1 deletion(-) + +diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c +index 9790b4fb1..b70abd88c 100644 +--- a/lib/librte_mbuf/rte_mbuf.c ++++ b/lib/librte_mbuf/rte_mbuf.c +@@ -167,6 +167,57 @@ rte_pktmbuf_pool_create(const char *name, unsigned int n, + data_room_size, socket_id, NULL); + } + ++struct rte_mempool * ++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n, ++ unsigned int cache_size, uint16_t priv_size, ++ uint16_t data_room_size, int socket_id, int dynamic_size) ++{ ++ struct rte_mempool *mp; ++ struct rte_pktmbuf_pool_private mbp_priv; ++ const char *mp_ops_name; ++ unsigned elt_size; ++ int ret; ++ ++ if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) { ++ RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n", ++ priv_size); ++ rte_errno = EINVAL; ++ return NULL; ++ } ++ elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size + ++ (unsigned)data_room_size; ++ mbp_priv.mbuf_data_room_size = data_room_size; ++ mbp_priv.mbuf_priv_size = priv_size; ++ ++ mp = rte_mempool_create_empty(name, n, elt_size, cache_size, ++ sizeof(struct rte_pktmbuf_pool_private), ++ socket_id, MEMPOOL_F_DYNAMIC); ++ if (mp == NULL) ++ return NULL; ++ ++ mp_ops_name = rte_mbuf_best_mempool_ops(); ++ ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL); ++ if (ret != 0) { ++ RTE_LOG(ERR, MBUF, "error setting mempool handler\n"); ++ rte_mempool_free(mp); ++ rte_errno = 
-ret; ++ return NULL; ++ } ++ rte_pktmbuf_pool_init(mp, &mbp_priv); ++ ++ rte_mempool_set_dynamic_size(mp, dynamic_size); ++ rte_mempool_set_dynamic_cb(mp, rte_pktmbuf_init); ++ ++ ret = rte_mempool_populate_default(mp); ++ if (ret < 0) { ++ rte_mempool_free(mp); ++ rte_errno = -ret; ++ return NULL; ++ } ++ ++ return mp; ++} ++ + /* do some sanity checks on a mbuf: panic if it fails */ + void + rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header) +diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h +index 3dbc6695e..5a2d81605 100644 +--- a/lib/librte_mbuf/rte_mbuf.h ++++ b/lib/librte_mbuf/rte_mbuf.h +@@ -1183,6 +1183,11 @@ rte_pktmbuf_pool_create(const char *name, unsigned n, + unsigned cache_size, uint16_t priv_size, uint16_t data_room_size, + int socket_id); + ++struct rte_mempool * ++rte_pktmbuf_dynamic_pool_create(const char *name, unsigned int n, ++ unsigned int cache_size, uint16_t priv_size, ++ uint16_t data_room_size, int socket_id, int dynamic_size); ++ + /** + * Create a mbuf pool with a given mempool ops name + * +diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map +index cae68db8d..d6d25af95 100644 +--- a/lib/librte_mbuf/rte_mbuf_version.map ++++ b/lib/librte_mbuf/rte_mbuf_version.map +@@ -44,4 +44,10 @@ DPDK_18.08 { + rte_mbuf_set_user_mempool_ops; + rte_mbuf_user_mempool_ops; + rte_pktmbuf_pool_create_by_ops; +-} DPDK_16.11; ++} DPDK_18.11; ++ ++DPDK_18.11 { ++ global: ++ ++ rte_pktmbuf_dynamic_pool_create; ++} DPDK_18.12; +-- +2.17.1 + diff --git a/examples/Makefile b/examples/Makefile index cf13574..7b88632 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -21,6 +21,17 @@ endif include $(RTE_SDK)/mk/rte.vars.mk -DIRS-y += l4fwd +#DIRS-y += l4fwd +DIRS-y += client +DIRS-y += server +DIRS-y += poll +DIRS-y += udp_echo +DIRS-y += udp_server +DIRS-y += udp_client +DIRS-y += tcp_echo +DIRS-y += tcp_client +DIRS-y += mt_tcp +DIRS-y += dynamic_streams +DIRS-y += tcp_echo_reuseport include $(TLDK_ROOT)/mk/tle.subdir.mk diff --git a/examples/client/Makefile b/examples/client/Makefile new file mode 100644 index 0000000..30c1d58 --- /dev/null +++ b/examples/client/Makefile @@ -0,0 +1,42 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +ifeq ($(RTE_TARGET),) +$(error "Please define RTE_TARGET environment variable") +endif + +ifeq ($(TLDK_ROOT),) +$(error "Please define TLDK_ROOT environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = client + +CFLAGS += -Wno-unused-result + +# all source are stored in SRCS-y +SRCS-y += client.c + +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(RTE_OUTPUT)/include + +LDLIBS += -L$(RTE_OUTPUT)/lib +LDLIBS += -ltle_glue -ltle_l4p -ltle_timer + +include $(TLDK_ROOT)/mk/tle.app.mk diff --git a/examples/client/client.c b/examples/client/client.c new file mode 100644 index 0000000..3e2e970 --- /dev/null +++ b/examples/client/client.c @@ -0,0 +1,83 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + int sockfd, portno, n; + struct sockaddr_in serv_addr; +#define BUF_SIZE 2000 + char buffer[BUF_SIZE]; + + memset(buffer, 'a', sizeof(buffer)); + + if (argc < 3) { + fprintf(stderr,"usage %s hostname port\n", argv[0]); + exit(0); + } + + portno = atoi(argv[2]); + + sockfd = socket(AF_INET, SOCK_STREAM, 0); + + if (sockfd < 0) { + perror("ERROR opening socket"); + exit(1); + } + + bzero((char *) &serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; +#if 0 + struct hostent *server; + server = gethostbyname(argv[1]); + + if (server == NULL) { + fprintf(stderr,"ERROR, no such host\n"); + exit(0); + } + + bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, server->h_length); +#else + inet_aton(argv[1], &serv_addr.sin_addr); +#endif + serv_addr.sin_port = htons(portno); + + /* Now connect to the server */ + if (connect(sockfd, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) { + perror("ERROR connecting"); + exit(1); + } + + printf("Please enter how many bytes to send: "); + if (scanf("%d", &n) < 0) { + perror("scanf error"); + exit(1); + } + + n = write(sockfd, buffer, n); + + if (n < 0) { + perror("ERROR writing to socket"); + exit(1); + } + + printf("Press any key to read: "); + fgets(buffer, sizeof(buffer) - 1, stdin); + + bzero(buffer, sizeof(buffer)); + n = read(sockfd, buffer, sizeof(buffer) - 1); + + if (n < 0) { + perror("ERROR reading from socket"); + exit(1); + } + + printf("%s\n",buffer); + return 0; +} diff --git a/examples/dynamic_streams/Makefile b/examples/dynamic_streams/Makefile new file mode 100644 index 0000000..2ca1a85 --- /dev/null +++ b/examples/dynamic_streams/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +ifeq ($(RTE_TARGET),) +$(error "Please define RTE_TARGET environment variable") +endif + +ifeq ($(TLDK_ROOT),) +$(error "Please define TLDK_ROOT environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = dynamic_streams + +# all source are stored in SRCS-y +SRCS-y += main.c + +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(RTE_OUTPUT)/include + +LDLIBS += -L$(RTE_OUTPUT)/lib +LDLIBS += -ltle_glue -ltle_l4p -ltle_timer + +include $(TLDK_ROOT)/mk/tle.app.mk diff --git a/examples/dynamic_streams/main.c b/examples/dynamic_streams/main.c new file mode 100644 index 0000000..a91fb67 --- /dev/null +++ b/examples/dynamic_streams/main.c @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define HUGE_2M "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages" +#define HUGE_1G "/sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages" + +static long int +get_value(const char *path) +{ + int fd, len; + long int value; + char buf[1024]; + + fd = open(path, O_RDONLY); + if (fd < 0) + return ULONG_MAX; + + len = read(fd, buf, sizeof(buf)); + + close(fd); + + if (len <= 0) { + return ULONG_MAX; + } + + value = strtol(buf, NULL, 10); + return value; +} + +static void +print_free_hugepages(void) +{ + printf("2M: %ld\t\t1G: %ld\n", get_value(HUGE_2M), get_value(HUGE_1G)); +} + +static int +make_socket_non_blocking(int sfd) +{ + int flags, s; + + flags = fcntl(sfd, F_GETFL, 0); + if (flags == -1) + { + perror("fcntl"); + return -1; + } + + flags |= O_NONBLOCK; + s = fcntl(sfd, F_SETFL, flags); + if (s == -1) + { + perror("fcntl"); + return -1; + } + + return 0; +} + +static int +create_and_bind(const char *addr, uint16_t port) +{ + int s, sfd; + struct sockaddr_in sin; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = inet_addr(addr); + sin.sin_port = htons(port); + + + sfd = socket(AF_INET, SOCK_STREAM, 0); + if (sfd == -1) { + perror("socket failed"); + return -1; + } + + s = bind(sfd, (struct sockaddr *)&sin, sizeof(sin)); + if (s < 0) { + perror("bind failed"); + close(sfd); + return -1; + } + + return sfd; +} + +static int +create_server(const char *addr, uint16_t port) +{ + int sfd; + + sfd = create_and_bind(addr, port); + if (sfd == -1) + return -1; + + if (make_socket_non_blocking(sfd) < 0) + abort(); + + if (listen(sfd, SOMAXCONN) < 0) { + perror("listen"); + return -1; + } + + return sfd; +} + +static int sock_idx = 0; + +extern struct rte_mempool *get_mempool_by_socket(int32_t socket_id); + +int +main(void) +{ + int i; + int fd; + int nb_socks = 1024 * 64 - 1; + int nb_mbufs = 0x80000; + int port_start = 1; + struct rte_mbuf *m; + struct rte_mempool *mp = get_mempool_by_socket(0); + + for (i = 0; i < nb_socks; i++) { + sock_idx = i; + fd = create_server("0.0.0.0", port_start + i); + if (fd < 0) { + printf("failed to create socket: %s\n", strerror(errno)); + break; + } + + if ((i % 4096) == 1) { + print_free_hugepages(); + usleep(100 * 1000); + } + } + + printf("We have successfully created %d sockets\n", i); + + for (i = 0; i < nb_mbufs; i++) { + m = rte_pktmbuf_alloc(mp); + if (m == NULL) { + printf("failed to alloc mbuf: %s\n", strerror(rte_errno)); + break; + } + if ((i % 4096) == 1) { + print_free_hugepages(); + usleep(100 * 1000); + } + } + + printf("We have successfully allocated %d mbufs\n", i); + + return EXIT_SUCCESS; +} 
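
As an illustration (not part of this patch), the dynamic_streams example
above exercises the dynamic mempool support added by the 0008/0009 DPDK
patches: when a dequeue finds a MEMPOOL_F_DYNAMIC pool empty and the pool
is not yet fully populated, another dynamic_size objects are populated on
demand. A sketch of creating such a pool with the
rte_pktmbuf_dynamic_pool_create() API from patch 0009 (the pool name,
sizes, and growth step here are arbitrary):

#include <stdio.h>
#include <rte_errno.h>
#include <rte_mbuf.h>

static struct rte_mempool *
create_dynamic_mbuf_pool(void)
{
	/* Up to 0x80000 mbufs in total, populated lazily: each time a
	 * dequeue finds the ring empty, up to 8192 more objects are
	 * added (see __mempool_generic_get() in patch 0008). */
	struct rte_mempool *mp = rte_pktmbuf_dynamic_pool_create(
		"dyn_mbuf_pool",           /* pool name */
		0x80000,                   /* max number of mbufs */
		256,                       /* per-lcore cache size */
		0,                         /* app private area size */
		RTE_MBUF_DEFAULT_BUF_SIZE, /* data room per mbuf */
		SOCKET_ID_ANY,             /* any NUMA socket */
		8192);                     /* dynamic_size: growth step */

	if (mp == NULL)
		printf("pool create failed: %s\n", rte_strerror(rte_errno));
	return mp;
}
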
diff --git a/examples/mt_tcp/Makefile b/examples/mt_tcp/Makefile new file mode 100644 index 0000000..ce5f8da --- /dev/null +++ b/examples/mt_tcp/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +ifeq ($(RTE_TARGET),) +$(error "Please define RTE_TARGET environment variable") +endif + +ifeq ($(TLDK_ROOT),) +$(error "Please define TLDK_ROOT environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = mt_tcp + +# all source are stored in SRCS-y +SRCS-y += mt_tcp.c + +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(RTE_OUTPUT)/include -pthread + +LDLIBS += -L$(RTE_OUTPUT)/lib +LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer + +include $(TLDK_ROOT)/mk/tle.app.mk diff --git a/examples/mt_tcp/mt_tcp.c b/examples/mt_tcp/mt_tcp.c new file mode 100644 index 0000000..b227f0b --- /dev/null +++ b/examples/mt_tcp/mt_tcp.c @@ -0,0 +1,210 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define WORKER 2 + +#define MAXLINE 1500 +#define OPEN_MAX 100 +#define LISTENQ 20 +#define INFTIM 1000 +#define MAX_EVENT 10000 + +char *local; +char *lport; + +static void set_nonblocking(int sock) +{ + int opts; + opts = fcntl(sock, F_GETFL); + + if (opts < 0) { + perror("fcntl(sock,GETFL)"); + exit(1); + } + + opts = opts | O_NONBLOCK; + if (fcntl(sock, F_SETFL, opts) < 0) { + perror("fcntl(sock, SETFL, opts)"); + exit(1); + } +} + +static int create_and_bind(char *addr, char *port) +{ + int s, sfd; + struct sockaddr_in sin; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = inet_addr(addr); + sin.sin_port = htons(atoi(port)); + + sfd = socket(AF_INET, SOCK_STREAM, 0); + if (sfd == -1) { + perror("socket failed"); + return -1; + } + + int val =1; + if (setsockopt(sfd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)) < 0) + perror("setsockopt()"); + + set_nonblocking(sfd); + + s = bind(sfd, (struct sockaddr *)&sin, sizeof(sin)); + if (s < 0) { + perror("bind failed"); + close(sfd); + return -1; + } + + return sfd; +} + +static void worker_handle(void *data) +{ + long w = (long)data; + int i, sfd, cfd, sockfd, epfd, nfds; + ssize_t n = 0; + int ret = 0; + char line[MAXLINE]; + char szAddr[1024] = "\0"; + struct sockaddr_in caddr; + socklen_t addrlen = sizeof(caddr); + struct epoll_event ev, events[20]; + + epfd = epoll_create(256); + if (epfd < 0) { + return; + } + + sfd = create_and_bind(local, lport); + if (sfd < 0) { + return; + } + + listen(sfd, 256); + + ev.data.fd = sfd; + ev.events = EPOLLIN | EPOLLET; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, sfd, &ev); + if (ret < 0) { + return; + } + + memset(line, 0, MAXLINE); + + for ( ; ; ) { + + nfds = epoll_wait(epfd, events, MAX_EVENT, 0); + for (i = 0; i < nfds; ++i) { + if (events[i].events & EPOLLIN) { + + sockfd = events[i].data.fd; + if (sockfd < 0) { + continue; + } + + if (sockfd == sfd) { + + /* accept process 
*/ + printf("i am worker %ld, begin to accept connection.\n", w); + cfd = accept(sfd, (struct sockaddr *)&caddr, &addrlen); + if (cfd != -1) { + printf("worker %ld accept a connection sucess. ip:%s, port:%d\n", w, inet_ntoa(caddr.sin_addr), caddr.sin_port); + } else { + printf("worker %ld accept a connection failed, error: %s\n", w, strerror(errno)); + } + if (cfd < 0) { + continue; + } + + set_nonblocking(cfd); + + ev.data.fd = cfd; + ev.events = EPOLLIN | EPOLLET; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &ev); + + + char* p = (char *)&caddr.sin_addr; + sprintf(szAddr, "%d.%d.%d.%d", *p, *(p + 1), *(p + 2), *(p + 3)); + printf("accept from %d:ip:%s port:%u\n", caddr.sin_family, szAddr, ntohs(caddr.sin_port)); + + } else { + + /* cfd process */ + n = read(sockfd, line, MAXLINE); + if (n < 0) { + if (errno == ECONNRESET) { + close(sockfd); + events[i].data.fd = -1; + } else { + printf("readline error\n"); + } + } else if (n == 0) { + perror("connfd = 0\n"); + close(sockfd); + events[i].data.fd = -1; + } + + printf("read len = %zi\n", n); + + ev.data.fd = sockfd; + ev.events = EPOLLOUT | EPOLLET; + epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev); + } + } else if(events[i].events & EPOLLOUT) { + + sockfd = events[i].data.fd; + if(sockfd < 0) + continue; + + n = write(sockfd, line, n); + printf("write len = %zi\n\n\n\n", n); + + ev.data.fd = sockfd; + ev.events = EPOLLIN | EPOLLET; + epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev); + + close(sockfd); + } + } + } +} + +int main(int argc, char *argv[]) +{ + if (argc != 3) { + fprintf (stderr, "Usage: %s \n", argv[0]); + exit (EXIT_FAILURE); + } + + local = argv[1]; + lport = argv[2]; + + long i = 0; + + pthread_t th[WORKER]; + for (i = 0; i < WORKER; i++) { + + if (pthread_create(&th[i], NULL, (void *)worker_handle, (void *)i)) { + perror("Failed to start all worker threads"); + return 1; + } + } + + for (i = 0; i < WORKER; i++) { + pthread_join(th[i], NULL); + } + + return 0; +} diff --git a/examples/poll/Makefile b/examples/poll/Makefile new file mode 100644 index 0000000..ceff7de --- /dev/null +++ b/examples/poll/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +ifeq ($(RTE_TARGET),) +$(error "Please define RTE_TARGET environment variable") +endif + +ifeq ($(TLDK_ROOT),) +$(error "Please define TLDK_ROOT environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = poll + +# all source are stored in SRCS-y +SRCS-y += poll.c + +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(RTE_OUTPUT)/include + +LDLIBS += -L$(RTE_OUTPUT)/lib +LDLIBS += -ltle_glue -ltle_l4p -ltle_timer + +include $(TLDK_ROOT)/mk/tle.app.mk diff --git a/examples/poll/poll.c b/examples/poll/poll.c new file mode 100644 index 0000000..8e8e379 --- /dev/null +++ b/examples/poll/poll.c @@ -0,0 +1,39 @@ +#include +#include +#include + +#define TIMEOUT 5 + +int main (void) +{ + struct pollfd fds[2]; + int ret; + + /* watch stdin for input */ + fds[0].fd = STDIN_FILENO; + fds[0].events = POLLIN; + + /* watch stdout for ability to write */ + fds[1].fd = STDOUT_FILENO; + fds[1].events = POLLOUT; + + ret = poll(fds, 2, TIMEOUT * 1000); + + if (ret == -1) { + perror ("poll"); + return 1; + } + + if (!ret) { + printf ("%d seconds elapsed.\n", TIMEOUT); + return 0; + } + + if (fds[0].revents & POLLIN) + printf ("stdin is readable\n"); + + if (fds[1].revents & POLLOUT) + printf ("stdout is writable\n"); + + return 0; +} diff --git a/examples/server/Makefile b/examples/server/Makefile new file mode 100644 index 0000000..f962e16 --- /dev/null +++ b/examples/server/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +ifeq ($(RTE_TARGET),) +$(error "Please define RTE_TARGET environment variable") +endif + +ifeq ($(TLDK_ROOT),) +$(error "Please define TLDK_ROOT environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = server + +# all source are stored in SRCS-y +SRCS-y += server.c + +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(RTE_OUTPUT)/include + +LDLIBS += -L$(RTE_OUTPUT)/lib +LDLIBS += -ltle_glue -ltle_l4p -ltle_timer + +include $(TLDK_ROOT)/mk/tle.app.mk diff --git a/examples/server/server.c b/examples/server/server.c new file mode 100644 index 0000000..915abd1 --- /dev/null +++ b/examples/server/server.c @@ -0,0 +1,251 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXEVENTS 64 + +static int +make_socket_non_blocking(int sfd) +{ + int flags, s; + + flags = fcntl(sfd, F_GETFL, 0); + if (flags == -1) + { + perror("fcntl"); + return -1; + } + + flags |= O_NONBLOCK; + s = fcntl(sfd, F_SETFL, flags); + if (s == -1) + { + perror("fcntl"); + return -1; + } + + return 0; +} + +static int +create_and_bind(char *addr, char *port) +{ + int s, sfd; + struct sockaddr_in sin; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = inet_addr(addr); + sin.sin_port = htons(atoi(port)); + + + sfd = socket(AF_INET, SOCK_STREAM, 0); + if (sfd == -1) { + perror("socket failed"); + return -1; + } + + s = bind(sfd, (struct sockaddr *)&sin, sizeof(sin)); + if (s < 0) { + perror("bind failed"); + close(sfd); + return -1; + } + + return sfd; +} + +int +main(int argc, char *argv[]) +{ + int sfd, s; + int efd; + struct epoll_event event; + struct epoll_event *events; + + if (argc != 3) + { + fprintf (stderr, "Usage: %s \n", argv[0]); + exit (EXIT_FAILURE); + } + + sfd = create_and_bind(argv[1], argv[2]); + if (sfd == -1) + abort(); + + s = make_socket_non_blocking(sfd); + if (s == -1) + abort(); + + s = listen(sfd, SOMAXCONN); + if (s == -1) + { + perror("listen"); + abort(); + } + + efd = epoll_create1(0); + if (efd == -1) + { + perror("epoll_create"); + abort(); + } + + event.data.fd = sfd; + event.events = EPOLLIN | EPOLLET; + s = epoll_ctl(efd, EPOLL_CTL_ADD, sfd, &event); + if (s == -1) + { + perror ("epoll_ctl"); + abort (); + } + + /* Buffer where events are returned */ + events = calloc(MAXEVENTS, sizeof event); + + /* The event loop */ + while (1) + { + int n, i; + + n = epoll_wait(efd, events, MAXEVENTS, -1); + for (i = 0; i < n; i++) + { + if ((events[i].events & EPOLLERR) || + (events[i].events & EPOLLHUP) || + (!(events[i].events & EPOLLIN))) + { + /* An error has occured on this fd, or the socket is not + ready for reading (why were we notified then?) */ + fprintf(stderr, "epoll error\n"); + close(events[i].data.fd); + continue; + } + + else if (sfd == events[i].data.fd) + { + /* We have a notification on the listening socket, which + means one or more incoming connections. */ + while (1) + { + struct sockaddr in_addr; + socklen_t in_len; + int infd; + char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; + + in_len = sizeof in_addr; + infd = accept (sfd, &in_addr, &in_len); + if (infd == -1) + { + if ((errno == EAGAIN) || + (errno == EWOULDBLOCK)) + { + /* We have processed all incoming + connections. 
*/ + break; + } + else + { + perror ("accept"); + break; + } + } + + s = getnameinfo (&in_addr, in_len, + hbuf, sizeof hbuf, + sbuf, sizeof sbuf, + NI_NUMERICHOST | NI_NUMERICSERV); + if (s == 0) + { + printf("Accepted connection on descriptor %d " + "(host=%s, port=%s)\n", infd, hbuf, sbuf); + } + + /* Make the incoming socket non-blocking and add it to the + list of fds to monitor. */ + s = make_socket_non_blocking(infd); + if (s == -1) + abort (); + + event.data.fd = infd; + event.events = EPOLLIN | EPOLLET; + s = epoll_ctl(efd, EPOLL_CTL_ADD, infd, &event); + if (s == -1) + { + perror ("epoll_ctl"); + abort (); + } + } + continue; + } + else + { + /* We have data on the fd waiting to be read. Read and + display it. We must read whatever data is available + completely, as we are running in edge-triggered mode + and won't get a notification again for the same + data. */ + int done = 0; + + while (1) + { + ssize_t count; + char buf[512]; + + count = read (events[i].data.fd, buf, sizeof buf); + if (count == -1) + { + /* If errno == EAGAIN, that means we have read all + data. So go back to the main loop. */ + if (errno != EAGAIN) + { + perror ("read"); + done = 1; + } + break; + } + else if (count == 0) + { + /* End of file. The remote has closed the + connection. */ + done = 1; + break; + } + + /* Write the buffer to standard output */ + s = write(1, buf, count); + if (s == -1) + { + perror("write"); + abort(); + } + } + + if (done) + { + printf ("Closed connection on descriptor %d\n", + events[i].data.fd); + + /* Closing the descriptor will make epoll remove it + from the set of descriptors which are monitored. */ + close (events[i].data.fd); + } + } + } + } + + free(events); + + close(sfd); + + return EXIT_SUCCESS; +} diff --git a/examples/tcp_client/Makefile b/examples/tcp_client/Makefile new file mode 100644 index 0000000..d222f68 --- /dev/null +++ b/examples/tcp_client/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +ifeq ($(RTE_TARGET),) +$(error "Please define RTE_TARGET environment variable") +endif + +ifeq ($(TLDK_ROOT),) +$(error "Please define TLDK_ROOT environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = tcp_client + +# all source are stored in SRCS-y +SRCS-y += tcp_client.c + +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(RTE_OUTPUT)/include + +LDLIBS += -L$(RTE_OUTPUT)/lib +LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer + +include $(TLDK_ROOT)/mk/tle.app.mk diff --git a/examples/tcp_client/tcp_client.c b/examples/tcp_client/tcp_client.c new file mode 100644 index 0000000..06841d5 --- /dev/null +++ b/examples/tcp_client/tcp_client.c @@ -0,0 +1,156 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXLINE 1400 +#define OPEN_MAX 100 +#define LISTENQ 20 +#define INFTIM 1000 +#define MAX_EVENTS 8 + +char *laddr; +char *lport; + +static void ip_show(const char *prefix, struct sockaddr_in *addr) +{ + char szAddr[20] = "\0"; + char* p = (char *)&addr->sin_addr; + + sprintf(szAddr, "%d.%d.%d.%d", *p, *(p + 1), *(p + 2), *(p + 3)); + printf("%s %d:ip:%s port:%u\n", prefix, addr->sin_family, szAddr, ntohs(addr->sin_port)); +} + +static void set_nonblocking(int sock) +{ + int opts; + opts = fcntl(sock, F_GETFL); + + if (opts < 0) { + perror("fcntl(sock,GETFL)"); + exit(EXIT_FAILURE); + } + + opts = opts | O_NONBLOCK; + if (fcntl(sock, F_SETFL, opts) < 0) { + perror("fcntl(sock, SETFL, opts)"); + exit(EXIT_FAILURE); + } +} + +int main(int argc, char *argv[]) +{ + int cfd; + struct sockaddr_in sin; + int i, sockfd, epfd, nfds; + char send_data[1500]; + char recv_data[1500]; + ssize_t n; + struct epoll_event ev, events[MAX_EVENTS]; + + if (argc != 3) { + fprintf (stderr, "Usage: %s \n", argv[0]); + exit(EXIT_FAILURE); + } + + laddr = argv[1]; + lport = argv[2]; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = inet_addr(laddr); + sin.sin_port = htons(atoi(lport));; + + cfd = socket(AF_INET, SOCK_STREAM, 0); + if (cfd == -1) { + perror("socket failed"); + return -1; + } + + int opt = SO_REUSEADDR; + setsockopt(cfd, SOL_SOCKET, SO_REUSEADDR, &opt,sizeof(opt)); + + set_nonblocking(cfd); + + if (connect(cfd, (struct sockaddr *)&sin, sizeof(struct sockaddr)) == -1) { + if (errno != EINPROGRESS && errno != EINTR) { + perror("Connect"); + exit(EXIT_FAILURE); + } + } + + epfd = epoll_create(256); + + ev.data.fd = cfd; + ev.events = EPOLLOUT | EPOLLIN | EPOLLET; + + epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &ev); + for ( ; ; ) { + + nfds = epoll_wait(epfd, events, MAX_EVENTS, -1); + for (i = 0; i < nfds; ++i) { + if (events[i].events & EPOLLIN) { + + sockfd = events[i].data.fd; + if (sockfd < 0) { + continue; + } + + n = read(sockfd, recv_data, MAXLINE); + ip_show("read pkt", &sin); + + if (n < 0) { + perror("read"); + exit(EXIT_FAILURE); + } else if (n == 0) { + printf("peer closed"); + exit(EXIT_FAILURE); + } else { + recv_data[n] = '\0'; + printf("recv length = %zi data: [%s]\n", n, recv_data); + } + + ev.data.fd = cfd; + ev.events = EPOLLOUT | EPOLLET; + epoll_ctl(epfd, EPOLL_CTL_MOD, cfd, &ev); + } else if(events[i].events & EPOLLOUT) { + + printf("\nSEND (q or Q to quit) : "); + if (fgets(send_data, MAXLINE, stdin) == NULL) { + perror("fgets"); + exit(EXIT_FAILURE); + } + + if (strcmp(send_data , "q") == 0 && + strcmp(send_data , "Q") == 0) + return 0; + + sockfd = events[i].data.fd; + if (sockfd == -1) { + 
printf("sockfd < 0\n"); + continue; + } + + n = write(sockfd, send_data, strlen(send_data)); + ip_show("write pkt", &sin); + + printf("send length = %zi data: [%s]\n", n, send_data); + + ev.data.fd = sockfd; + ev.events = EPOLLIN | EPOLLET; + epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev); + } else { + printf("\n receive handled events: 0x%x", events[i].events); + break; + } + } + } + + return 0; +} diff --git a/examples/tcp_echo/Makefile b/examples/tcp_echo/Makefile new file mode 100644 index 0000000..f92570d --- /dev/null +++ b/examples/tcp_echo/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +ifeq ($(RTE_TARGET),) +$(error "Please define RTE_TARGET environment variable") +endif + +ifeq ($(TLDK_ROOT),) +$(error "Please define TLDK_ROOT environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = tcp_echo + +# all source are stored in SRCS-y +SRCS-y += tcp_echo.c + +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(RTE_OUTPUT)/include + +LDLIBS += -L$(RTE_OUTPUT)/lib +LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer + +include $(TLDK_ROOT)/mk/tle.app.mk diff --git a/examples/tcp_echo/tcp_echo.c b/examples/tcp_echo/tcp_echo.c new file mode 100644 index 0000000..d804ea7 --- /dev/null +++ b/examples/tcp_echo/tcp_echo.c @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXLINE 4096 +#define OPEN_MAX 100 +#define LISTENQ 20 +#define INFTIM 1000 +#define MAX_EVENT 10000 + +static char line[MAXLINE]; + +static void set_nonblocking(int sock) +{ + int opts; + opts = fcntl(sock, F_GETFL); + + if (opts < 0) { + perror("fcntl(sock,GETFL)"); + exit(1); + } + + opts = opts | O_NONBLOCK; + if (fcntl(sock, F_SETFL, opts) < 0) { + perror("fcntl(sock, SETFL, opts)"); + exit(1); + } +} + +static int create_and_bind(char *addr, char *port) +{ + int s, sfd; + struct sockaddr_in sin; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = inet_addr(addr); + sin.sin_port = htons(atoi(port)); + + sfd = socket(AF_INET, SOCK_STREAM, 0); + if (sfd == -1) { + perror("socket failed"); + return -1; + } + printf("AF_INET = %d SOCK_STREAM socket, sfd = %d\n", AF_INET, sfd); + + set_nonblocking(sfd); + + s = bind(sfd, (struct sockaddr *)&sin, sizeof(sin)); + if (s < 0) { + perror("bind failed"); + close(sfd); + return -1; + } + + return sfd; +} + +int main(int argc, char *argv[]) +{ + int i, sfd, cfd, sockfd, epfd, nfds; + int ret; + ssize_t n = 0; + char szAddr[1024] = "\0"; + struct sockaddr_in caddr; + socklen_t addrlen = sizeof(caddr); + struct epoll_event ev, events[20]; + + if (argc != 3) { + fprintf (stderr, "Usage: %s \n", argv[0]); + exit (EXIT_FAILURE); + } + + epfd = epoll_create(256); + printf("epoll_create(256) return %d\n", epfd); + + sfd = create_and_bind(argv[1], argv[2]); + if (sfd < 0) { + return -1; + } + + listen(sfd, 256); + + ev.data.fd 
+	ev.events = EPOLLIN | EPOLLET;
+	ret = epoll_ctl(epfd, EPOLL_CTL_ADD, sfd, &ev);
+	printf("epoll_ctl return %d sfd = %d\n", ret, sfd);
+
+	for ( ; ; ) {
+
+		/* maxevents must not exceed the size of events[] */
+		nfds = epoll_wait(epfd, events, 20, 1000);
+		for (i = 0; i < nfds; ++i) {
+			if (events[i].events & EPOLLIN) {
+				sockfd = events[i].data.fd;
+				if (sockfd < 0)
+					continue;
+
+				if (sockfd == sfd) {
+					/* accept process */
+					cfd = accept(sfd, (struct sockaddr *)&caddr, &addrlen);
+					if (cfd < 0)
+						continue;
+
+					set_nonblocking(cfd);
+
+					ev.data.fd = cfd;
+					ev.events = EPOLLIN | EPOLLET;
+					ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &ev);
+
+					unsigned char *p = (unsigned char *)&caddr.sin_addr;
+					sprintf(szAddr, "%d.%d.%d.%d", *p, *(p + 1), *(p + 2), *(p + 3));
+					printf("accept from %d:ip:%s port:%u\n", caddr.sin_family, szAddr, ntohs(caddr.sin_port));
+				} else {
+					/* cfd process; leave room for the '\0' below */
+					n = read(sockfd, line, MAXLINE - 1);
+					if (n < 0) {
+						if (errno == ECONNRESET) {
+							close(sockfd);
+							events[i].data.fd = -1;
+						} else {
+							perror("read error");
+							close(sockfd);
+						}
+					} else if (n == 0) {
+						/* read() returning 0 means the peer closed the connection */
+						printf("peer closed\n");
+						close(sockfd);
+						events[i].data.fd = -1;
+					} else {
+						line[n] = '\0';
+						printf("read len = %zi, data : %s\n", n, line);
+
+						/* rearm for writing only while the fd is still open */
+						ev.data.fd = sockfd;
+						ev.events = EPOLLOUT | EPOLLET;
+						epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
+					}
+				}
+			} else if (events[i].events & EPOLLOUT) {
+				sockfd = events[i].data.fd;
+				n = write(sockfd, line, n);
+				printf("write len = %zi, data : %s\n", n, line);
+
+				ev.data.fd = sockfd;
+				ev.events = EPOLLIN | EPOLLET;
+				epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
+			} else if (events[i].events & (EPOLLERR | EPOLLHUP)) {
+				sockfd = events[i].data.fd;
+				printf("peer closed\n");
+				close(sockfd);
+			}
+		}
+	}
+
+	return 0;
+}
diff --git a/examples/tcp_echo_reuseport/Makefile b/examples/tcp_echo_reuseport/Makefile
new file mode 100644
index 0000000..b6d3197
--- /dev/null
+++ b/examples/tcp_echo_reuseport/Makefile
@@ -0,0 +1,40 @@
+# Copyright (c) 2018 Ant Financial Services Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
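+#
+# Note: this example links against the TLDK glue layer (libtle_glue);
+# build the TLDK libraries first (see the top-level README).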
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+ifeq ($(RTE_TARGET),)
+$(error "Please define RTE_TARGET environment variable")
+endif
+
+ifeq ($(TLDK_ROOT),)
+$(error "Please define TLDK_ROOT environment variable")
+endif
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = tcp_echo_reuseport
+
+# all sources are stored in SRCS-y
+SRCS-y += main.c
+
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(RTE_OUTPUT)/include
+
+LDLIBS += -L$(RTE_OUTPUT)/lib
+LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer
+
+include $(TLDK_ROOT)/mk/tle.app.mk
diff --git a/examples/tcp_echo_reuseport/main.c b/examples/tcp_echo_reuseport/main.c
new file mode 100644
index 0000000..2e10d8e
--- /dev/null
+++ b/examples/tcp_echo_reuseport/main.c
@@ -0,0 +1,191 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define MAXLINE 4096
+#define OPEN_MAX 100
+#define LISTENQ 20
+#define INFTIM 1000
+#define MAX_EVENT 10000
+
+static char line[MAXLINE];
+
+static void set_nonblocking(int sock)
+{
+	int opts;
+	opts = fcntl(sock, F_GETFL);
+
+	if (opts < 0) {
+		perror("fcntl(sock,GETFL)");
+		exit(1);
+	}
+
+	opts = opts | O_NONBLOCK;
+	if (fcntl(sock, F_SETFL, opts) < 0) {
+		perror("fcntl(sock, SETFL, opts)");
+		exit(1);
+	}
+}
+
+static int create_and_bind(char *addr, char *port)
+{
+	int s, sfd;
+	int one = 1;
+	struct sockaddr_in sin;
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = inet_addr(addr);
+	sin.sin_port = htons(atoi(port));
+
+	sfd = socket(AF_INET, SOCK_STREAM, 0);
+	if (sfd == -1) {
+		perror("socket failed");
+		return -1;
+	}
+	printf("AF_INET = %d SOCK_STREAM socket, sfd = %d\n", AF_INET, sfd);
+
+	set_nonblocking(sfd);
+
+	/* allow all NB_SOCK listeners to bind the same address and port */
+	setsockopt(sfd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
+
+	s = bind(sfd, (struct sockaddr *)&sin, sizeof(sin));
+	if (s < 0) {
+		perror("bind failed");
+		close(sfd);
+		return -1;
+	}
+
+	return sfd;
+}
+
+#define NB_SOCK 8
+static int sfds[NB_SOCK];
+
+static int
+sfd_lookup(int fd)
+{
+	int i;
+
+	for (i = 0; i < NB_SOCK; i++)
+		if (fd == sfds[i])
+			return i;
+	return -1;
+}
+
+int main(int argc, char *argv[])
+{
+	int i, j, cfd, sockfd, epfd, nfds;
+	int ret;
+	ssize_t n = 0;
+	char szAddr[1024] = "\0";
+	struct sockaddr_in caddr;
+	socklen_t addrlen = sizeof(caddr);
+	struct epoll_event ev, events[20];
+
+	if (argc != 3) {
+		fprintf(stderr, "Usage: %s <bind_ip> <bind_port>\n", argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	epfd = epoll_create(256);
+	printf("epoll_create(256) return %d\n", epfd);
+
+	for (i = 0; i < NB_SOCK; i++) {
+		sfds[i] = create_and_bind(argv[1], argv[2]);
+		if (sfds[i] < 0) {
+			perror("create_and_bind");
+			exit(1);
+		}
+
+		listen(sfds[i], 256);
+
+		ev.data.fd = sfds[i];
+		ev.events = EPOLLIN | EPOLLET;
+		ret = epoll_ctl(epfd, EPOLL_CTL_ADD, sfds[i], &ev);
+		if (ret < 0) {
+			perror("epoll_ctl sfd");
+			exit(1);
+		}
+		printf("server #%d: fd = %d\n", i, sfds[i]);
+	}
+
+	for ( ; ; ) {
+
+		/* maxevents must not exceed the size of events[] */
+		nfds = epoll_wait(epfd, events, 20, 1000);
+		for (i = 0; i < nfds; ++i) {
+			if (events[i].events & EPOLLIN) {
+				sockfd = events[i].data.fd;
+				if (sockfd < 0)
+					continue;
+
+				j = sfd_lookup(sockfd);
+				if (j >= 0) {
+					/* accept process */
+					cfd = accept(sockfd, (struct sockaddr *)&caddr, &addrlen);
+					if (cfd < 0)
+						continue;
+
+					set_nonblocking(cfd);
+
+					ev.data.fd = cfd;
+					ev.events = EPOLLIN | EPOLLET;
+					ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &ev);
+
+					unsigned char *p = (unsigned char *)&caddr.sin_addr;
+					sprintf(szAddr, "%d.%d.%d.%d", *p, *(p + 1), *(p + 2), *(p + 3));
+					printf("accept from #%d (fd = %d): ip=%s port=%u\n",
+						j, sockfd, szAddr, ntohs(caddr.sin_port));
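+					/* all NB_SOCK listeners are bound to the same addr:port,
+					 * so any of them may have accepted this connection */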
printf("\tnow you can send \"close \" to close sfd\n"); + } else { + /* cfd process */ + n = read(sockfd, line, MAXLINE); + if (n < 0) { + if (errno == ECONNRESET) { + close(sockfd); + events[i].data.fd = -1; + } else { + perror("read error"); + close(sockfd); + } + } else if (n == 0) { + perror("connfd = 0\n"); + close(sockfd); + events[i].data.fd = -1; + } else { + line[n] = '\0'; + printf("read len = %zi, data : %s\n", n, line); + } + + ev.data.fd = sockfd; + ev.events = EPOLLOUT | EPOLLET; + epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev); + + if (sscanf(line, "close %d", &j) == 1) { + printf("will close #%d sfd = %d\n", j, sfds[j]); + close(sfds[j]); + sfds[j] = -1; + } + } + } else if (events[i].events & EPOLLOUT) { + sockfd = events[i].data.fd; + n = write(sockfd, line, n); + printf("write len = %zi, data : %s\n", n, line); + + ev.data.fd = sockfd; + ev.events = EPOLLIN | EPOLLET; + epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev); + } else if (events[i].events & (EPOLLERR | EPOLLHUP)) { + sockfd = events[i].data.fd; + printf("peer closed\n"); + close(sockfd); + } + } + } + + return 0; +} diff --git a/examples/udp_client/Makefile b/examples/udp_client/Makefile new file mode 100755 index 0000000..d349746 --- /dev/null +++ b/examples/udp_client/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2018 Ant Financial Services Group. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+ifeq ($(RTE_TARGET),)
+$(error "Please define RTE_TARGET environment variable")
+endif
+
+ifeq ($(TLDK_ROOT),)
+$(error "Please define TLDK_ROOT environment variable")
+endif
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = udp_client
+
+# all sources are stored in SRCS-y
+SRCS-y += udp_client.c
+
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(RTE_OUTPUT)/include
+
+LDLIBS += -L$(RTE_OUTPUT)/lib
+LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer
+
+include $(TLDK_ROOT)/mk/tle.app.mk
diff --git a/examples/udp_client/udp_client.c b/examples/udp_client/udp_client.c
new file mode 100755
index 0000000..5601669
--- /dev/null
+++ b/examples/udp_client/udp_client.c
@@ -0,0 +1,166 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define MAXLINE 1400
+#define OPEN_MAX 100
+#define LISTENQ 20
+#define INFTIM 1000
+#define MAX_EVENT 10000
+
+char *laddr;
+char *lport;
+
+static void ip_show(const char *prefix, struct sockaddr_in *addr)
+{
+	char szAddr[20] = "\0";
+	/* print as unsigned bytes, otherwise octets >= 128 come out negative */
+	unsigned char *p = (unsigned char *)&addr->sin_addr;
+
+	sprintf(szAddr, "%d.%d.%d.%d", *p, *(p + 1), *(p + 2), *(p + 3));
+	printf("%s %d:ip:%s port:%u\n", prefix, addr->sin_family, szAddr, ntohs(addr->sin_port));
+}
+
+static void set_nonblocking(int sock)
+{
+	int opts;
+	opts = fcntl(sock, F_GETFL);
+
+	if (opts < 0) {
+		perror("fcntl(sock,GETFL)");
+		exit(1);
+	}
+
+	opts = opts | O_NONBLOCK;
+	if (fcntl(sock, F_SETFL, opts) < 0) {
+		perror("fcntl(sock, SETFL, opts)");
+		exit(1);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	int cfd;
+	struct sockaddr_in sin;
+	int i, sockfd, epfd, nfds;
+	char send_data[1500];
+	char recv_data[1500];
+	ssize_t n;
+	struct sockaddr_in caddr;
+	socklen_t addrlen = sizeof(caddr);
+	struct epoll_event ev, events[20];
+
+	if (argc != 3) {
+		fprintf(stderr, "Usage: %s <server_ip> <server_port>\n", argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	laddr = argv[1];
+	lport = argv[2];
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = inet_addr(laddr);
+	sin.sin_port = htons(atoi(lport));
+
+	cfd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (cfd == -1) {
+		perror("socket failed");
+		return -1;
+	}
+	printf("AF_INET = %d SOCK_DGRAM socket, cfd = %d\n", AF_INET, cfd);
+
+	int opt = 1;	/* option value: enable SO_REUSEADDR */
+	setsockopt(cfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+	set_nonblocking(cfd);
+
+	if (connect(cfd, (struct sockaddr *)&sin, sizeof(struct sockaddr)) == -1) {
+		perror("Connect");
+		exit(1);
+	}
+
+	epfd = epoll_create(256);
+	ev.data.fd = cfd;
+	ev.events = EPOLLOUT | EPOLLET;
+
+	epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &ev);
+	for ( ; ; ) {
+
+		/* maxevents must not exceed the size of events[] */
+		nfds = epoll_wait(epfd, events, 20, 0);
+		for (i = 0; i < nfds; ++i) {
+			if (events[i].events & EPOLLIN) {
+
+				sockfd = events[i].data.fd;
+				if (sockfd < 0) {
+					continue;
+				}
+
+				/* messages starting with '0' exercise the
+				 * recvfrom()/sendto() path instead of read()/write() */
+				if (send_data[0] == '0') {
+					n = recvfrom(sockfd, recv_data, MAXLINE, 0, (struct sockaddr *)&caddr, &addrlen);
+
+					ip_show("0recvfrom pkt", &caddr);
+				} else {
+					n = read(sockfd, recv_data, MAXLINE);
+					ip_show("read pkt", &sin);
+				}
+
+				if (n < 0) {
+					if (errno == ECONNRESET) {
+						close(sockfd);
+						return 0;
+					} else {
+						printf("recv error\n");
+					}
+				} else if (n == 0) {
+					/* for UDP this is a zero-length datagram, not EOF */
+					printf("zero-length datagram\n");
+				} else {
+					recv_data[n] = '\0';
+					printf("recv length = %zi data: [%s]\n", n, recv_data);
+				}
+
+				ev.data.fd = cfd;
+				ev.events = EPOLLOUT | EPOLLET;
+				epoll_ctl(epfd, EPOLL_CTL_MOD, cfd, &ev);
+			} else if (events[i].events & EPOLLOUT) {
+
+				printf("\nSEND (q or Q to quit) : ");
+				if (fgets(send_data, MAXLINE, stdin) == NULL) {
+					perror("fgets");
+					exit(1);
+				}
+
+				/* fgets() keeps the trailing newline */
+				if (strcmp(send_data, "q\n") == 0 ||
+				    strcmp(send_data, "Q\n") == 0) {
+					return 0;
+				}
+
+				sockfd = events[i].data.fd;
+				if (sockfd == -1) {
+					continue;
+				}
+
+				if (send_data[0] == '0') {
+					n = sendto(sockfd, send_data, strlen(send_data), 0, (struct sockaddr *)&sin, addrlen);
+					ip_show("sendto pkt", &sin);
+				} else {
+					n = write(sockfd, send_data, strlen(send_data));
+					ip_show("write pkt", &sin);
+				}
+
+				printf("send length = %zi data: [%s]\n", n, send_data);
+
+				ev.data.fd = sockfd;
+				ev.events = EPOLLIN | EPOLLET;
+				epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
+			}
+		}
+	}
+
+	return 0;
+}
diff --git a/examples/udp_echo/Makefile b/examples/udp_echo/Makefile
new file mode 100644
index 0000000..4858a9a
--- /dev/null
+++ b/examples/udp_echo/Makefile
@@ -0,0 +1,40 @@
+# Copyright (c) 2018 Ant Financial Services Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+ifeq ($(RTE_TARGET),)
+$(error "Please define RTE_TARGET environment variable")
+endif
+
+ifeq ($(TLDK_ROOT),)
+$(error "Please define TLDK_ROOT environment variable")
+endif
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = udp_echo
+
+# all sources are stored in SRCS-y
+SRCS-y += udp_echo.c
+
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(RTE_OUTPUT)/include
+
+LDLIBS += -L$(RTE_OUTPUT)/lib
+LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer
+
+include $(TLDK_ROOT)/mk/tle.app.mk
diff --git a/examples/udp_echo/udp_echo.c b/examples/udp_echo/udp_echo.c
new file mode 100644
index 0000000..3198112
--- /dev/null
+++ b/examples/udp_echo/udp_echo.c
@@ -0,0 +1,143 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define MAXLINE 1500
+#define OPEN_MAX 100
+#define LISTENQ 20
+#define INFTIM 1000
+#define MAX_EVENT 10000
+
+static void set_nonblocking(int sock)
+{
+	int opts;
+	opts = fcntl(sock, F_GETFL);
+
+	if (opts < 0) {
+		perror("fcntl(sock,GETFL)");
+		exit(1);
+	}
+
+	opts = opts | O_NONBLOCK;
+	if (fcntl(sock, F_SETFL, opts) < 0) {
+		perror("fcntl(sock, SETFL, opts)");
+		exit(1);
+	}
+}
+
+static int create_and_bind(char *addr, char *port)
+{
+	int s, sfd;
+	struct sockaddr_in sin;
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = inet_addr(addr);
+	sin.sin_port = htons(atoi(port));
+
+	sfd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sfd == -1) {
+		perror("socket failed");
+		return -1;
+	}
+
+	set_nonblocking(sfd);
+
+	s = bind(sfd, (struct sockaddr *)&sin, sizeof(sin));
+	if (s < 0) {
+		perror("bind failed");
+		close(sfd);
+		return -1;
+	}
+
+	return sfd;
+}
+
+int main(int argc, char *argv[])
+{
+	int i, sfd, sockfd, epfd, nfds;
+	int ret;
+	ssize_t n = 0;
+	char line[MAXLINE];
+	char szAddr[1024] = "\0";
+	struct sockaddr_in caddr;
+	socklen_t addrlen = sizeof(caddr);
+	struct epoll_event ev, events[20];
+
+	if (argc != 3) {
+		fprintf(stderr, "Usage: %s <bind_ip> <bind_port>\n", argv[0]);
+		exit(EXIT_FAILURE);
+	}
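+
+	/* one UDP socket driven by edge-triggered epoll; every datagram
+	 * received is echoed back to its sender via sendto() */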
+
+	epfd = epoll_create(256);
+	printf("epoll_create(256) return %d\n", epfd);
+
+	sfd = create_and_bind(argv[1], argv[2]);
+	if (sfd < 0) {
+		return -1;
+	}
+
+	ev.data.fd = sfd;
+	ev.events = EPOLLIN | EPOLLET;
+	ret = epoll_ctl(epfd, EPOLL_CTL_ADD, sfd, &ev);
+	printf("epoll_ctl return %d\n", ret);
+
+	memset(line, 0, MAXLINE);
+
+	for ( ; ; ) {
+
+		/* maxevents must not exceed the size of events[] */
+		nfds = epoll_wait(epfd, events, 20, 0);
+		for (i = 0; i < nfds; ++i) {
+			if (events[i].events & EPOLLIN) {
+
+				if ((sockfd = events[i].data.fd) < 0)
+					continue;
+
+				n = recvfrom(sockfd, line, MAXLINE - 1, 0,
+					(struct sockaddr *)&caddr, &addrlen);
+				if (n < 0) {
+					if (errno == ECONNRESET) {
+						close(sockfd);
+						events[i].data.fd = -1;
+					} else {
+						printf("recvfrom error\n");
+					}
+					continue;
+				}
+				/* for UDP, n == 0 is a zero-length datagram, not EOF */
+
+				unsigned char *p = (unsigned char *)&caddr.sin_addr;
+				line[n] = '\0';
+				sprintf(szAddr, "%d.%d.%d.%d", *p, *(p + 1), *(p + 2), *(p + 3));
+				printf("recv %s from %d:ip:%s port:%u\n", line, caddr.sin_family, szAddr, ntohs(caddr.sin_port));
+
+				ev.data.fd = sockfd;
+				ev.events = EPOLLOUT | EPOLLET;
+				epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
+
+			} else if (events[i].events & EPOLLOUT) {
+
+				if (events[i].data.fd == -1) {
+					continue;
+				}
+
+				sockfd = events[i].data.fd;
+				printf("send %s\n", line);
+				ev.data.fd = sockfd;
+				ev.events = EPOLLIN | EPOLLET;
+				sendto(sockfd, line, n, 0, (struct sockaddr *)&caddr, addrlen);
+				epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
+			}
+		}
+	}
+
+	return 0;
+}
diff --git a/examples/udp_proxy/Makefile b/examples/udp_proxy/Makefile
new file mode 100644
index 0000000..fa79042
--- /dev/null
+++ b/examples/udp_proxy/Makefile
@@ -0,0 +1,19 @@
+TARGET = udp_proxy
+
+SRCS = $(wildcard *.c)
+OBJS = $(SRCS:.c=.o)
+
+CC = gcc
+INCLUDES = -I./libev/include
+LDFLAGS = -L./libev/lib -lev -lm -lpthread
+CFLAGS = -g -Wall -O2
+
+$(TARGET) : $(OBJS)
+	$(CC) $^ $(LDFLAGS) $(LIBS) -o $@
+
+%.o : %.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c $<
+
+.PHONY : clean
+clean:
+	rm -rf $(OBJS) $(TARGET)
diff --git a/examples/udp_proxy/Makefile.bbk b/examples/udp_proxy/Makefile.bbk
new file mode 100644
index 0000000..e090d90
--- /dev/null
+++ b/examples/udp_proxy/Makefile.bbk
@@ -0,0 +1,40 @@
+# Copyright (c) 2018 Ant Financial Services Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
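+#
+# Note: this variant links against the bundled libev; run make inside
+# ./libev first so that ./libev/include and ./libev/lib are populated.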
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+ifeq ($(RTE_TARGET),)
+$(error "Please define RTE_TARGET environment variable")
+endif
+
+ifeq ($(TLDK_ROOT),)
+$(error "Please define TLDK_ROOT environment variable")
+endif
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = mt_tcp
+
+# all sources are stored in SRCS-y
+SRCS-y += main.c
+
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(RTE_OUTPUT)/include -I./libev/include -pthread
+
+LDLIBS += -L$(RTE_OUTPUT)/lib -L./libev/lib
+LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer -lev -lm
+
+include $(TLDK_ROOT)/mk/tle.app.mk
diff --git a/examples/udp_proxy/libev/Makefile b/examples/udp_proxy/libev/Makefile
new file mode 100644
index 0000000..666be8c
--- /dev/null
+++ b/examples/udp_proxy/libev/Makefile
@@ -0,0 +1,35 @@
+TARGET = libev.so
+
+SRCS = ev.c event.c
+OBJS = $(SRCS:.c=.o)
+
+EVINCPATH = `pwd`/include/
+EVLIBPATH = `pwd`/lib/
+
+CC = gcc
+INCLUDES = -I.
+LIBS = -L.
+CFLAGS = -g -O2 -DHAVE_CONFIG_H -MT ev.lo -MD -MP -MF .deps/ev.Tpo -fPIC -DPIC
+LDFLAGS = -shared -fPIC -DPIC -lm -O2
+#gcc -DHAVE_CONFIG_H -I. -g -O3 -MT ev.lo -MD -MP -MF .deps/ev.Tpo -c ev.c -fPIC -DPIC -o .libs/ev.o
+
+#gcc -DHAVE_CONFIG_H -I. -g -O3 -MT event.lo -MD -MP -MF .deps/event.Tpo -c event.c -fPIC -DPIC -o event.o
+
+#libtool: link: gcc -shared -fPIC -DPIC .libs/ev.o .libs/event.o -lm -O3 -Wl,-soname -Wl,libev.so.4 -o .libs/libev.so.4.0.0
+#libtool: link: (cd ".libs" && rm -f "libev.so.4" && ln -s "libev.so.4.0.0" "libev.so.4")
+#libtool: link: (cd ".libs" && rm -f "libev.so" && ln -s "libev.so.4.0.0" "libev.so")
+#libtool: link: ar cru .libs/libev.a ev.o event.o
+
+$(TARGET) : $(OBJS)
+	$(CC) $(LDFLAGS) $^ $(INCLUDES) $(LIBS) -o $@
+	ar cru libev.a $^
+	cp -f ev.h $(EVINCPATH)
+	cp -f libev.a $(EVLIBPATH)
+
+%.o : %.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c $<
+
+.PHONY : clean
+clean:
+	rm -rf ./lib/* ./include/*
+	rm -rf $(OBJS) libev.so libev.a $(TARGET)
diff --git a/examples/udp_proxy/libev/config.h b/examples/udp_proxy/libev/config.h
new file mode 100644
index 0000000..c9d8a7d
--- /dev/null
+++ b/examples/udp_proxy/libev/config.h
@@ -0,0 +1,126 @@
+/* config.h.  Generated from config.h.in by configure. */
+/* config.h.in.  Generated from configure.ac by autoheader. */
+
+/* Define to 1 if you have the `clock_gettime' function. */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 to use the syscall interface for clock_gettime */
+/* #undef HAVE_CLOCK_SYSCALL */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `epoll_ctl' function. */
+#define HAVE_EPOLL_CTL 1
+
+/* Define to 1 if you have the `eventfd' function. */
+#define HAVE_EVENTFD 1
+
+/* Define to 1 if the floor function is available */
+#define HAVE_FLOOR 1
+
+/* Define to 1 if you have the `inotify_init' function. */
+#define HAVE_INOTIFY_INIT 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `kqueue' function. */
+/* #undef HAVE_KQUEUE */
+
+/* Define to 1 if you have the `rt' library (-lrt). */
+/* #undef HAVE_LIBRT */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `nanosleep' function. */
+#define HAVE_NANOSLEEP 1
+
+/* Define to 1 if you have the `poll' function. */
+#define HAVE_POLL 1
+
+/* Define to 1 if you have the <poll.h> header file. */
+#define HAVE_POLL_H 1
+
+/* Define to 1 if you have the `port_create' function. */
+/* #undef HAVE_PORT_CREATE */
+
+/* Define to 1 if you have the <port.h> header file. */
+/* #undef HAVE_PORT_H */
+
+/* Define to 1 if you have the `select' function. */
+#define HAVE_SELECT 1
+
+/* Define to 1 if you have the `signalfd' function. */
+#define HAVE_SIGNALFD 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+#define HAVE_SYS_EPOLL_H 1
+
+/* Define to 1 if you have the <sys/eventfd.h> header file. */
+#define HAVE_SYS_EVENTFD_H 1
+
+/* Define to 1 if you have the <sys/event.h> header file. */
+/* #undef HAVE_SYS_EVENT_H */
+
+/* Define to 1 if you have the <sys/inotify.h> header file. */
+#define HAVE_SYS_INOTIFY_H 1
+
+/* Define to 1 if you have the <sys/select.h> header file. */
+#define HAVE_SYS_SELECT_H 1
+
+/* Define to 1 if you have the <sys/signalfd.h> header file. */
+#define HAVE_SYS_SIGNALFD_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* Name of package */
+#define PACKAGE "libev"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT ""
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME ""
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING ""
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME ""
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION ""
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Version number of package */
+#define VERSION "4.24"
diff --git a/examples/udp_proxy/libev/ev++.h b/examples/udp_proxy/libev/ev++.h
new file mode 100644
index 0000000..4f0a36a
--- /dev/null
+++ b/examples/udp_proxy/libev/ev++.h
@@ -0,0 +1,816 @@
+/*
+ * libev simple C++ wrapper classes
+ *
+ * Copyright (c) 2007,2008,2010 Marc Alexander Lehmann
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice,
+ *      this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#ifndef EVPP_H__ +#define EVPP_H__ + +#ifdef EV_H +# include EV_H +#else +# include "ev.h" +#endif + +#ifndef EV_USE_STDEXCEPT +# define EV_USE_STDEXCEPT 1 +#endif + +#if EV_USE_STDEXCEPT +# include +#endif + +namespace ev { + + typedef ev_tstamp tstamp; + + enum { + UNDEF = EV_UNDEF, + NONE = EV_NONE, + READ = EV_READ, + WRITE = EV_WRITE, +#if EV_COMPAT3 + TIMEOUT = EV_TIMEOUT, +#endif + TIMER = EV_TIMER, + PERIODIC = EV_PERIODIC, + SIGNAL = EV_SIGNAL, + CHILD = EV_CHILD, + STAT = EV_STAT, + IDLE = EV_IDLE, + CHECK = EV_CHECK, + PREPARE = EV_PREPARE, + FORK = EV_FORK, + ASYNC = EV_ASYNC, + EMBED = EV_EMBED, +# undef ERROR // some systems stupidly #define ERROR + ERROR = EV_ERROR + }; + + enum + { + AUTO = EVFLAG_AUTO, + NOENV = EVFLAG_NOENV, + FORKCHECK = EVFLAG_FORKCHECK, + + SELECT = EVBACKEND_SELECT, + POLL = EVBACKEND_POLL, + EPOLL = EVBACKEND_EPOLL, + KQUEUE = EVBACKEND_KQUEUE, + DEVPOLL = EVBACKEND_DEVPOLL, + PORT = EVBACKEND_PORT + }; + + enum + { +#if EV_COMPAT3 + NONBLOCK = EVLOOP_NONBLOCK, + ONESHOT = EVLOOP_ONESHOT, +#endif + NOWAIT = EVRUN_NOWAIT, + ONCE = EVRUN_ONCE + }; + + enum how_t + { + ONE = EVBREAK_ONE, + ALL = EVBREAK_ALL + }; + + struct bad_loop +#if EV_USE_STDEXCEPT + : std::runtime_error +#endif + { +#if EV_USE_STDEXCEPT + bad_loop () + : std::runtime_error ("libev event loop cannot be initialized, bad value of LIBEV_FLAGS?") + { + } +#endif + }; + +#ifdef EV_AX +# undef EV_AX +#endif + +#ifdef EV_AX_ +# undef EV_AX_ +#endif + +#if EV_MULTIPLICITY +# define EV_AX raw_loop +# define EV_AX_ raw_loop, +#else +# define EV_AX +# define EV_AX_ +#endif + + struct loop_ref + { + loop_ref (EV_P) throw () +#if EV_MULTIPLICITY + : EV_AX (EV_A) +#endif + { + } + + bool operator == (const loop_ref &other) const throw () + { +#if EV_MULTIPLICITY + return EV_AX == other.EV_AX; +#else + return true; +#endif + } + + bool operator != (const loop_ref &other) const throw () + { +#if EV_MULTIPLICITY + return ! 
(*this == other); +#else + return false; +#endif + } + +#if EV_MULTIPLICITY + bool operator == (const EV_P) const throw () + { + return this->EV_AX == EV_A; + } + + bool operator != (const EV_P) const throw () + { + return (*this == EV_A); + } + + operator struct ev_loop * () const throw () + { + return EV_AX; + } + + operator const struct ev_loop * () const throw () + { + return EV_AX; + } + + bool is_default () const throw () + { + return EV_AX == ev_default_loop (0); + } +#endif + +#if EV_COMPAT3 + void loop (int flags = 0) + { + ev_run (EV_AX_ flags); + } + + void unloop (how_t how = ONE) throw () + { + ev_break (EV_AX_ how); + } +#endif + + void run (int flags = 0) + { + ev_run (EV_AX_ flags); + } + + void break_loop (how_t how = ONE) throw () + { + ev_break (EV_AX_ how); + } + + void post_fork () throw () + { + ev_loop_fork (EV_AX); + } + + unsigned int backend () const throw () + { + return ev_backend (EV_AX); + } + + tstamp now () const throw () + { + return ev_now (EV_AX); + } + + void ref () throw () + { + ev_ref (EV_AX); + } + + void unref () throw () + { + ev_unref (EV_AX); + } + +#if EV_FEATURE_API + unsigned int iteration () const throw () + { + return ev_iteration (EV_AX); + } + + unsigned int depth () const throw () + { + return ev_depth (EV_AX); + } + + void set_io_collect_interval (tstamp interval) throw () + { + ev_set_io_collect_interval (EV_AX_ interval); + } + + void set_timeout_collect_interval (tstamp interval) throw () + { + ev_set_timeout_collect_interval (EV_AX_ interval); + } +#endif + + // function callback + void once (int fd, int events, tstamp timeout, void (*cb)(int, void *), void *arg = 0) throw () + { + ev_once (EV_AX_ fd, events, timeout, cb, arg); + } + + // method callback + template + void once (int fd, int events, tstamp timeout, K *object) throw () + { + once (fd, events, timeout, method_thunk, object); + } + + // default method == operator () + template + void once (int fd, int events, tstamp timeout, K *object) throw () + { + once (fd, events, timeout, method_thunk, object); + } + + template + static void method_thunk (int revents, void *arg) + { + (static_cast(arg)->*method) + (revents); + } + + // no-argument method callback + template + void once (int fd, int events, tstamp timeout, K *object) throw () + { + once (fd, events, timeout, method_noargs_thunk, object); + } + + template + static void method_noargs_thunk (int revents, void *arg) + { + (static_cast(arg)->*method) + (); + } + + // simpler function callback + template + void once (int fd, int events, tstamp timeout) throw () + { + once (fd, events, timeout, simpler_func_thunk); + } + + template + static void simpler_func_thunk (int revents, void *arg) + { + (*cb) + (revents); + } + + // simplest function callback + template + void once (int fd, int events, tstamp timeout) throw () + { + once (fd, events, timeout, simplest_func_thunk); + } + + template + static void simplest_func_thunk (int revents, void *arg) + { + (*cb) + (); + } + + void feed_fd_event (int fd, int revents) throw () + { + ev_feed_fd_event (EV_AX_ fd, revents); + } + + void feed_signal_event (int signum) throw () + { + ev_feed_signal_event (EV_AX_ signum); + } + +#if EV_MULTIPLICITY + struct ev_loop* EV_AX; +#endif + + }; + +#if EV_MULTIPLICITY + struct dynamic_loop : loop_ref + { + + dynamic_loop (unsigned int flags = AUTO) throw (bad_loop) + : loop_ref (ev_loop_new (flags)) + { + if (!EV_AX) + throw bad_loop (); + } + + ~dynamic_loop () throw () + { + ev_loop_destroy (EV_AX); + EV_AX = 0; + } + + private: + + 
dynamic_loop (const dynamic_loop &); + + dynamic_loop & operator= (const dynamic_loop &); + + }; +#endif + + struct default_loop : loop_ref + { + default_loop (unsigned int flags = AUTO) throw (bad_loop) +#if EV_MULTIPLICITY + : loop_ref (ev_default_loop (flags)) +#endif + { + if ( +#if EV_MULTIPLICITY + !EV_AX +#else + !ev_default_loop (flags) +#endif + ) + throw bad_loop (); + } + + private: + default_loop (const default_loop &); + default_loop &operator = (const default_loop &); + }; + + inline loop_ref get_default_loop () throw () + { +#if EV_MULTIPLICITY + return ev_default_loop (0); +#else + return loop_ref (); +#endif + } + +#undef EV_AX +#undef EV_AX_ + +#undef EV_PX +#undef EV_PX_ +#if EV_MULTIPLICITY +# define EV_PX loop_ref EV_A +# define EV_PX_ loop_ref EV_A_ +#else +# define EV_PX +# define EV_PX_ +#endif + + template + struct base : ev_watcher + { + #if EV_MULTIPLICITY + EV_PX; + + // loop set + void set (EV_P) throw () + { + this->EV_A = EV_A; + } + #endif + + base (EV_PX) throw () + #if EV_MULTIPLICITY + : EV_A (EV_A) + #endif + { + ev_init (this, 0); + } + + void set_ (const void *data, void (*cb)(EV_P_ ev_watcher *w, int revents)) throw () + { + this->data = (void *)data; + ev_set_cb (static_cast(this), cb); + } + + // function callback + template + void set (void *data = 0) throw () + { + set_ (data, function_thunk); + } + + template + static void function_thunk (EV_P_ ev_watcher *w, int revents) + { + function + (*static_cast(w), revents); + } + + // method callback + template + void set (K *object) throw () + { + set_ (object, method_thunk); + } + + // default method == operator () + template + void set (K *object) throw () + { + set_ (object, method_thunk); + } + + template + static void method_thunk (EV_P_ ev_watcher *w, int revents) + { + (static_cast(w->data)->*method) + (*static_cast(w), revents); + } + + // no-argument callback + template + void set (K *object) throw () + { + set_ (object, method_noargs_thunk); + } + + template + static void method_noargs_thunk (EV_P_ ev_watcher *w, int revents) + { + (static_cast(w->data)->*method) + (); + } + + void operator ()(int events = EV_UNDEF) + { + return + ev_cb (static_cast(this)) + (static_cast(this), events); + } + + bool is_active () const throw () + { + return ev_is_active (static_cast(this)); + } + + bool is_pending () const throw () + { + return ev_is_pending (static_cast(this)); + } + + void feed_event (int revents) throw () + { + ev_feed_event (EV_A_ static_cast(this), revents); + } + }; + + inline tstamp now (EV_P) throw () + { + return ev_now (EV_A); + } + + inline void delay (tstamp interval) throw () + { + ev_sleep (interval); + } + + inline int version_major () throw () + { + return ev_version_major (); + } + + inline int version_minor () throw () + { + return ev_version_minor (); + } + + inline unsigned int supported_backends () throw () + { + return ev_supported_backends (); + } + + inline unsigned int recommended_backends () throw () + { + return ev_recommended_backends (); + } + + inline unsigned int embeddable_backends () throw () + { + return ev_embeddable_backends (); + } + + inline void set_allocator (void *(*cb)(void *ptr, long size) throw ()) throw () + { + ev_set_allocator (cb); + } + + inline void set_syserr_cb (void (*cb)(const char *msg) throw ()) throw () + { + ev_set_syserr_cb (cb); + } + + #if EV_MULTIPLICITY + #define EV_CONSTRUCT(cppstem,cstem) \ + (EV_PX = get_default_loop ()) throw () \ + : base (EV_A) \ + { \ + } + #else + #define EV_CONSTRUCT(cppstem,cstem) \ + () throw () \ + { \ + 
} + #endif + + /* using a template here would require quite a few more lines, + * so a macro solution was chosen */ + #define EV_BEGIN_WATCHER(cppstem,cstem) \ + \ + struct cppstem : base \ + { \ + void start () throw () \ + { \ + ev_ ## cstem ## _start (EV_A_ static_cast(this)); \ + } \ + \ + void stop () throw () \ + { \ + ev_ ## cstem ## _stop (EV_A_ static_cast(this)); \ + } \ + \ + cppstem EV_CONSTRUCT(cppstem,cstem) \ + \ + ~cppstem () throw () \ + { \ + stop (); \ + } \ + \ + using base::set; \ + \ + private: \ + \ + cppstem (const cppstem &o); \ + \ + cppstem &operator =(const cppstem &o); \ + \ + public: + + #define EV_END_WATCHER(cppstem,cstem) \ + }; + + EV_BEGIN_WATCHER (io, io) + void set (int fd, int events) throw () + { + int active = is_active (); + if (active) stop (); + ev_io_set (static_cast(this), fd, events); + if (active) start (); + } + + void set (int events) throw () + { + int active = is_active (); + if (active) stop (); + ev_io_set (static_cast(this), fd, events); + if (active) start (); + } + + void start (int fd, int events) throw () + { + set (fd, events); + start (); + } + EV_END_WATCHER (io, io) + + EV_BEGIN_WATCHER (timer, timer) + void set (ev_tstamp after, ev_tstamp repeat = 0.) throw () + { + int active = is_active (); + if (active) stop (); + ev_timer_set (static_cast(this), after, repeat); + if (active) start (); + } + + void start (ev_tstamp after, ev_tstamp repeat = 0.) throw () + { + set (after, repeat); + start (); + } + + void again () throw () + { + ev_timer_again (EV_A_ static_cast(this)); + } + + ev_tstamp remaining () + { + return ev_timer_remaining (EV_A_ static_cast(this)); + } + EV_END_WATCHER (timer, timer) + + #if EV_PERIODIC_ENABLE + EV_BEGIN_WATCHER (periodic, periodic) + void set (ev_tstamp at, ev_tstamp interval = 0.) throw () + { + int active = is_active (); + if (active) stop (); + ev_periodic_set (static_cast(this), at, interval, 0); + if (active) start (); + } + + void start (ev_tstamp at, ev_tstamp interval = 0.) throw () + { + set (at, interval); + start (); + } + + void again () throw () + { + ev_periodic_again (EV_A_ static_cast(this)); + } + EV_END_WATCHER (periodic, periodic) + #endif + + #if EV_SIGNAL_ENABLE + EV_BEGIN_WATCHER (sig, signal) + void set (int signum) throw () + { + int active = is_active (); + if (active) stop (); + ev_signal_set (static_cast(this), signum); + if (active) start (); + } + + void start (int signum) throw () + { + set (signum); + start (); + } + EV_END_WATCHER (sig, signal) + #endif + + #if EV_CHILD_ENABLE + EV_BEGIN_WATCHER (child, child) + void set (int pid, int trace = 0) throw () + { + int active = is_active (); + if (active) stop (); + ev_child_set (static_cast(this), pid, trace); + if (active) start (); + } + + void start (int pid, int trace = 0) throw () + { + set (pid, trace); + start (); + } + EV_END_WATCHER (child, child) + #endif + + #if EV_STAT_ENABLE + EV_BEGIN_WATCHER (stat, stat) + void set (const char *path, ev_tstamp interval = 0.) throw () + { + int active = is_active (); + if (active) stop (); + ev_stat_set (static_cast(this), path, interval); + if (active) start (); + } + + void start (const char *path, ev_tstamp interval = 0.) 
throw () + { + stop (); + set (path, interval); + start (); + } + + void update () throw () + { + ev_stat_stat (EV_A_ static_cast(this)); + } + EV_END_WATCHER (stat, stat) + #endif + + #if EV_IDLE_ENABLE + EV_BEGIN_WATCHER (idle, idle) + void set () throw () { } + EV_END_WATCHER (idle, idle) + #endif + + #if EV_PREPARE_ENABLE + EV_BEGIN_WATCHER (prepare, prepare) + void set () throw () { } + EV_END_WATCHER (prepare, prepare) + #endif + + #if EV_CHECK_ENABLE + EV_BEGIN_WATCHER (check, check) + void set () throw () { } + EV_END_WATCHER (check, check) + #endif + + #if EV_EMBED_ENABLE + EV_BEGIN_WATCHER (embed, embed) + void set_embed (struct ev_loop *embedded_loop) throw () + { + int active = is_active (); + if (active) stop (); + ev_embed_set (static_cast(this), embedded_loop); + if (active) start (); + } + + void start (struct ev_loop *embedded_loop) throw () + { + set (embedded_loop); + start (); + } + + void sweep () + { + ev_embed_sweep (EV_A_ static_cast(this)); + } + EV_END_WATCHER (embed, embed) + #endif + + #if EV_FORK_ENABLE + EV_BEGIN_WATCHER (fork, fork) + void set () throw () { } + EV_END_WATCHER (fork, fork) + #endif + + #if EV_ASYNC_ENABLE + EV_BEGIN_WATCHER (async, async) + void send () throw () + { + ev_async_send (EV_A_ static_cast(this)); + } + + bool async_pending () throw () + { + return ev_async_pending (static_cast(this)); + } + EV_END_WATCHER (async, async) + #endif + + #undef EV_PX + #undef EV_PX_ + #undef EV_CONSTRUCT + #undef EV_BEGIN_WATCHER + #undef EV_END_WATCHER +} + +#endif + diff --git a/examples/udp_proxy/libev/ev.c b/examples/udp_proxy/libev/ev.c new file mode 100644 index 0000000..0dec443 --- /dev/null +++ b/examples/udp_proxy/libev/ev.c @@ -0,0 +1,5145 @@ +/* + * libev event processing core, watcher management + * + * Copyright (c) 2007,2008,2009,2010,2011,2012,2013 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. 
If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +/* this big block deduces configuration from config.h */ +#ifndef EV_STANDALONE +# ifdef EV_CONFIG_H +# include EV_CONFIG_H +# else +# include "config.h" +# endif + +# if HAVE_FLOOR +# ifndef EV_USE_FLOOR +# define EV_USE_FLOOR 1 +# endif +# endif + +# if HAVE_CLOCK_SYSCALL +# ifndef EV_USE_CLOCK_SYSCALL +# define EV_USE_CLOCK_SYSCALL 1 +# ifndef EV_USE_REALTIME +# define EV_USE_REALTIME 0 +# endif +# ifndef EV_USE_MONOTONIC +# define EV_USE_MONOTONIC 1 +# endif +# endif +# elif !defined EV_USE_CLOCK_SYSCALL +# define EV_USE_CLOCK_SYSCALL 0 +# endif + +# if HAVE_CLOCK_GETTIME +# ifndef EV_USE_MONOTONIC +# define EV_USE_MONOTONIC 1 +# endif +# ifndef EV_USE_REALTIME +# define EV_USE_REALTIME 0 +# endif +# else +# ifndef EV_USE_MONOTONIC +# define EV_USE_MONOTONIC 0 +# endif +# ifndef EV_USE_REALTIME +# define EV_USE_REALTIME 0 +# endif +# endif + +# if HAVE_NANOSLEEP +# ifndef EV_USE_NANOSLEEP +# define EV_USE_NANOSLEEP EV_FEATURE_OS +# endif +# else +# undef EV_USE_NANOSLEEP +# define EV_USE_NANOSLEEP 0 +# endif + +# if HAVE_SELECT && HAVE_SYS_SELECT_H +# ifndef EV_USE_SELECT +# define EV_USE_SELECT EV_FEATURE_BACKENDS +# endif +# else +# undef EV_USE_SELECT +# define EV_USE_SELECT 0 +# endif + +# if HAVE_POLL && HAVE_POLL_H +# ifndef EV_USE_POLL +# define EV_USE_POLL EV_FEATURE_BACKENDS +# endif +# else +# undef EV_USE_POLL +# define EV_USE_POLL 0 +# endif + +# if HAVE_EPOLL_CTL && HAVE_SYS_EPOLL_H +# ifndef EV_USE_EPOLL +# define EV_USE_EPOLL EV_FEATURE_BACKENDS +# endif +# else +# undef EV_USE_EPOLL +# define EV_USE_EPOLL 0 +# endif + +# if HAVE_KQUEUE && HAVE_SYS_EVENT_H +# ifndef EV_USE_KQUEUE +# define EV_USE_KQUEUE EV_FEATURE_BACKENDS +# endif +# else +# undef EV_USE_KQUEUE +# define EV_USE_KQUEUE 0 +# endif + +# if HAVE_PORT_H && HAVE_PORT_CREATE +# ifndef EV_USE_PORT +# define EV_USE_PORT EV_FEATURE_BACKENDS +# endif +# else +# undef EV_USE_PORT +# define EV_USE_PORT 0 +# endif + +# if HAVE_INOTIFY_INIT && HAVE_SYS_INOTIFY_H +# ifndef EV_USE_INOTIFY +# define EV_USE_INOTIFY EV_FEATURE_OS +# endif +# else +# undef EV_USE_INOTIFY +# define EV_USE_INOTIFY 0 +# endif + +# if HAVE_SIGNALFD && HAVE_SYS_SIGNALFD_H +# ifndef EV_USE_SIGNALFD +# define EV_USE_SIGNALFD EV_FEATURE_OS +# endif +# else +# undef EV_USE_SIGNALFD +# define EV_USE_SIGNALFD 0 +# endif + +# if HAVE_EVENTFD +# ifndef EV_USE_EVENTFD +# define EV_USE_EVENTFD EV_FEATURE_OS +# endif +# else +# undef EV_USE_EVENTFD +# define EV_USE_EVENTFD 0 +# endif + +#endif + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#ifdef EV_H +# include EV_H +#else +# include "ev.h" +#endif + +#if EV_NO_THREADS +# undef EV_NO_SMP +# define EV_NO_SMP 1 +# undef ECB_NO_THREADS +# define ECB_NO_THREADS 1 +#endif +#if EV_NO_SMP +# undef EV_NO_SMP +# define ECB_NO_SMP 1 +#endif + +#ifndef _WIN32 +# include +# include +# include +#else +# include +# define WIN32_LEAN_AND_MEAN +# include +# include +# ifndef EV_SELECT_IS_WINSOCKET +# define EV_SELECT_IS_WINSOCKET 1 +# endif +# undef EV_AVOID_STDIO +#endif + +/* OS X, in its infinite 
idiocy, actually HARDCODES + * a limit of 1024 into their select. Where people have brains, + * OS X engineers apparently have a vacuum. Or maybe they were + * ordered to have a vacuum, or they do anything for money. + * This might help. Or not. + */ +#define _DARWIN_UNLIMITED_SELECT 1 + +/* this block tries to deduce configuration from header-defined symbols and defaults */ + +/* try to deduce the maximum number of signals on this platform */ +#if defined EV_NSIG +/* use what's provided */ +#elif defined NSIG +# define EV_NSIG (NSIG) +#elif defined _NSIG +# define EV_NSIG (_NSIG) +#elif defined SIGMAX +# define EV_NSIG (SIGMAX+1) +#elif defined SIG_MAX +# define EV_NSIG (SIG_MAX+1) +#elif defined _SIG_MAX +# define EV_NSIG (_SIG_MAX+1) +#elif defined MAXSIG +# define EV_NSIG (MAXSIG+1) +#elif defined MAX_SIG +# define EV_NSIG (MAX_SIG+1) +#elif defined SIGARRAYSIZE +# define EV_NSIG (SIGARRAYSIZE) /* Assume ary[SIGARRAYSIZE] */ +#elif defined _sys_nsig +# define EV_NSIG (_sys_nsig) /* Solaris 2.5 */ +#else +# define EV_NSIG (8 * sizeof (sigset_t) + 1) +#endif + +#ifndef EV_USE_FLOOR +# define EV_USE_FLOOR 0 +#endif + +#ifndef EV_USE_CLOCK_SYSCALL +# if __linux && __GLIBC__ == 2 && __GLIBC_MINOR__ < 17 +# define EV_USE_CLOCK_SYSCALL EV_FEATURE_OS +# else +# define EV_USE_CLOCK_SYSCALL 0 +# endif +#endif + +#if !(_POSIX_TIMERS > 0) +# ifndef EV_USE_MONOTONIC +# define EV_USE_MONOTONIC 0 +# endif +# ifndef EV_USE_REALTIME +# define EV_USE_REALTIME 0 +# endif +#endif + +#ifndef EV_USE_MONOTONIC +# if defined _POSIX_MONOTONIC_CLOCK && _POSIX_MONOTONIC_CLOCK >= 0 +# define EV_USE_MONOTONIC EV_FEATURE_OS +# else +# define EV_USE_MONOTONIC 0 +# endif +#endif + +#ifndef EV_USE_REALTIME +# define EV_USE_REALTIME !EV_USE_CLOCK_SYSCALL +#endif + +#ifndef EV_USE_NANOSLEEP +# if _POSIX_C_SOURCE >= 199309L +# define EV_USE_NANOSLEEP EV_FEATURE_OS +# else +# define EV_USE_NANOSLEEP 0 +# endif +#endif + +#ifndef EV_USE_SELECT +# define EV_USE_SELECT EV_FEATURE_BACKENDS +#endif + +#ifndef EV_USE_POLL +# ifdef _WIN32 +# define EV_USE_POLL 0 +# else +# define EV_USE_POLL EV_FEATURE_BACKENDS +# endif +#endif + +#ifndef EV_USE_EPOLL +# if __linux && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 4)) +# define EV_USE_EPOLL EV_FEATURE_BACKENDS +# else +# define EV_USE_EPOLL 0 +# endif +#endif + +#ifndef EV_USE_KQUEUE +# define EV_USE_KQUEUE 0 +#endif + +#ifndef EV_USE_PORT +# define EV_USE_PORT 0 +#endif + +#ifndef EV_USE_INOTIFY +# if __linux && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 4)) +# define EV_USE_INOTIFY EV_FEATURE_OS +# else +# define EV_USE_INOTIFY 0 +# endif +#endif + +#ifndef EV_PID_HASHSIZE +# define EV_PID_HASHSIZE EV_FEATURE_DATA ? 16 : 1 +#endif + +#ifndef EV_INOTIFY_HASHSIZE +# define EV_INOTIFY_HASHSIZE EV_FEATURE_DATA ? 16 : 1 +#endif + +#ifndef EV_USE_EVENTFD +# if __linux && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 7)) +# define EV_USE_EVENTFD EV_FEATURE_OS +# else +# define EV_USE_EVENTFD 0 +# endif +#endif + +#ifndef EV_USE_SIGNALFD +# if __linux && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 7)) +# define EV_USE_SIGNALFD EV_FEATURE_OS +# else +# define EV_USE_SIGNALFD 0 +# endif +#endif + +#if 0 /* debugging */ +# define EV_VERIFY 3 +# define EV_USE_4HEAP 1 +# define EV_HEAP_CACHE_AT 1 +#endif + +#ifndef EV_VERIFY +# define EV_VERIFY (EV_FEATURE_API ? 
1 : 0) +#endif + +#ifndef EV_USE_4HEAP +# define EV_USE_4HEAP EV_FEATURE_DATA +#endif + +#ifndef EV_HEAP_CACHE_AT +# define EV_HEAP_CACHE_AT EV_FEATURE_DATA +#endif + +#ifdef ANDROID +/* supposedly, android doesn't typedef fd_mask */ +# undef EV_USE_SELECT +# define EV_USE_SELECT 0 +/* supposedly, we need to include syscall.h, not sys/syscall.h, so just disable */ +# undef EV_USE_CLOCK_SYSCALL +# define EV_USE_CLOCK_SYSCALL 0 +#endif + +/* aix's poll.h seems to cause lots of trouble */ +#ifdef _AIX +/* AIX has a completely broken poll.h header */ +# undef EV_USE_POLL +# define EV_USE_POLL 0 +#endif + +/* on linux, we can use a (slow) syscall to avoid a dependency on pthread, */ +/* which makes programs even slower. might work on other unices, too. */ +#if EV_USE_CLOCK_SYSCALL +# include +# ifdef SYS_clock_gettime +# define clock_gettime(id, ts) syscall (SYS_clock_gettime, (id), (ts)) +# undef EV_USE_MONOTONIC +# define EV_USE_MONOTONIC 1 +# else +# undef EV_USE_CLOCK_SYSCALL +# define EV_USE_CLOCK_SYSCALL 0 +# endif +#endif + +/* this block fixes any misconfiguration where we know we run into trouble otherwise */ + +#ifndef CLOCK_MONOTONIC +# undef EV_USE_MONOTONIC +# define EV_USE_MONOTONIC 0 +#endif + +#ifndef CLOCK_REALTIME +# undef EV_USE_REALTIME +# define EV_USE_REALTIME 0 +#endif + +#if !EV_STAT_ENABLE +# undef EV_USE_INOTIFY +# define EV_USE_INOTIFY 0 +#endif + +#if !EV_USE_NANOSLEEP +/* hp-ux has it in sys/time.h, which we unconditionally include above */ +# if !defined _WIN32 && !defined __hpux +# include +# endif +#endif + +#if EV_USE_INOTIFY +# include +# include +/* some very old inotify.h headers don't have IN_DONT_FOLLOW */ +# ifndef IN_DONT_FOLLOW +# undef EV_USE_INOTIFY +# define EV_USE_INOTIFY 0 +# endif +#endif + +#if EV_USE_EVENTFD +/* our minimum requirement is glibc 2.7 which has the stub, but not the header */ +# include +# ifndef EFD_NONBLOCK +# define EFD_NONBLOCK O_NONBLOCK +# endif +# ifndef EFD_CLOEXEC +# ifdef O_CLOEXEC +# define EFD_CLOEXEC O_CLOEXEC +# else +# define EFD_CLOEXEC 02000000 +# endif +# endif +EV_CPP(extern "C") int (eventfd) (unsigned int initval, int flags); +#endif + +#if EV_USE_SIGNALFD +/* our minimum requirement is glibc 2.7 which has the stub, but not the header */ +# include +# ifndef SFD_NONBLOCK +# define SFD_NONBLOCK O_NONBLOCK +# endif +# ifndef SFD_CLOEXEC +# ifdef O_CLOEXEC +# define SFD_CLOEXEC O_CLOEXEC +# else +# define SFD_CLOEXEC 02000000 +# endif +# endif +EV_CPP (extern "C") int signalfd (int fd, const sigset_t *mask, int flags); + +struct signalfd_siginfo +{ + uint32_t ssi_signo; + char pad[128 - sizeof (uint32_t)]; +}; +#endif + +/**/ + +#if EV_VERIFY >= 3 +# define EV_FREQUENT_CHECK ev_verify (EV_A) +#else +# define EV_FREQUENT_CHECK do { } while (0) +#endif + +/* + * This is used to work around floating point rounding problems. + * This value is good at least till the year 4000. + */ +#define MIN_INTERVAL 0.0001220703125 /* 1/2**13, good till 4000 */ +/*#define MIN_INTERVAL 0.00000095367431640625 /* 1/2**20, good till 2200 */ + +#define MIN_TIMEJUMP 1. 
/* minimum timejump that gets detected (if monotonic clock available) */ +#define MAX_BLOCKTIME 59.743 /* never wait longer than this time (to detect time jumps) */ + +#define EV_TV_SET(tv,t) do { tv.tv_sec = (long)t; tv.tv_usec = (long)((t - tv.tv_sec) * 1e6); } while (0) +#define EV_TS_SET(ts,t) do { ts.tv_sec = (long)t; ts.tv_nsec = (long)((t - ts.tv_sec) * 1e9); } while (0) + +/* the following is ecb.h embedded into libev - use update_ev_c to update from an external copy */ +/* ECB.H BEGIN */ +/* + * libecb - http://software.schmorp.de/pkg/libecb + * + * Copyright (©) 2009-2015 Marc Alexander Lehmann + * Copyright (©) 2011 Emanuele Giaquinta + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#ifndef ECB_H +#define ECB_H + +/* 16 bits major, 16 bits minor */ +#define ECB_VERSION 0x00010005 + +#ifdef _WIN32 + typedef signed char int8_t; + typedef unsigned char uint8_t; + typedef signed short int16_t; + typedef unsigned short uint16_t; + typedef signed int int32_t; + typedef unsigned int uint32_t; + #if __GNUC__ + typedef signed long long int64_t; + typedef unsigned long long uint64_t; + #else /* _MSC_VER || __BORLANDC__ */ + typedef signed __int64 int64_t; + typedef unsigned __int64 uint64_t; + #endif + #ifdef _WIN64 + #define ECB_PTRSIZE 8 + typedef uint64_t uintptr_t; + typedef int64_t intptr_t; + #else + #define ECB_PTRSIZE 4 + typedef uint32_t uintptr_t; + typedef int32_t intptr_t; + #endif +#else + #include + #if (defined INTPTR_MAX ? 
+/* the following is ecb.h embedded into libev - use update_ev_c to update from an external copy */
+/* ECB.H BEGIN */
+/*
+ * libecb - http://software.schmorp.de/pkg/libecb
+ *
+ * Copyright (c) 2009-2015 Marc Alexander Lehmann
+ * Copyright (c) 2011 Emanuele Giaquinta
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifndef ECB_H
+#define ECB_H
+
+/* 16 bits major, 16 bits minor */
+#define ECB_VERSION 0x00010005
+
+#ifdef _WIN32
+  typedef signed   char      int8_t;
+  typedef unsigned char     uint8_t;
+  typedef signed   short    int16_t;
+  typedef unsigned short   uint16_t;
+  typedef signed   int      int32_t;
+  typedef unsigned int     uint32_t;
+  #if __GNUC__
+    typedef signed   long long int64_t;
+    typedef unsigned long long uint64_t;
+  #else /* _MSC_VER || __BORLANDC__ */
+    typedef signed   __int64  int64_t;
+    typedef unsigned __int64 uint64_t;
+  #endif
+  #ifdef _WIN64
+    #define ECB_PTRSIZE 8
+    typedef uint64_t uintptr_t;
+    typedef  int64_t  intptr_t;
+  #else
+    #define ECB_PTRSIZE 4
+    typedef uint32_t uintptr_t;
+    typedef  int32_t  intptr_t;
+  #endif
+#else
+  #include <inttypes.h>
+  #if (defined INTPTR_MAX ? INTPTR_MAX : ULONG_MAX) > 0xffffffffU
+    #define ECB_PTRSIZE 8
+  #else
+    #define ECB_PTRSIZE 4
+  #endif
+#endif
+
+#define ECB_GCC_AMD64 (__amd64 || __amd64__ || __x86_64 || __x86_64__)
+#define ECB_MSVC_AMD64 (_M_AMD64 || _M_X64)
+
+/* work around x32 idiocy by defining proper macros */
+#if ECB_GCC_AMD64 || ECB_MSVC_AMD64
+  #if _ILP32
+    #define ECB_AMD64_X32 1
+  #else
+    #define ECB_AMD64 1
+  #endif
+#endif
+
+/* many compilers define _GNUC_ to some versions but then only implement
+ * what their idiot authors think are the "more important" extensions,
+ * causing enormous grief in return for some better fake benchmark numbers.
+ * or so.
+ * we try to detect these and simply assume they are not gcc - if they have
+ * an issue with that they should have done it right in the first place.
+ */
+#if !defined __GNUC_MINOR__ || defined __INTEL_COMPILER || defined __SUNPRO_C || defined __SUNPRO_CC || defined __llvm__ || defined __clang__
+  #define ECB_GCC_VERSION(major,minor) 0
+#else
+  #define ECB_GCC_VERSION(major,minor) (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#endif
+
+#define ECB_CLANG_VERSION(major,minor) (__clang_major__ > (major) || (__clang_major__ == (major) && __clang_minor__ >= (minor)))
+
+#if __clang__ && defined __has_builtin
+  #define ECB_CLANG_BUILTIN(x) __has_builtin (x)
+#else
+  #define ECB_CLANG_BUILTIN(x) 0
+#endif
+
+#if __clang__ && defined __has_extension
+  #define ECB_CLANG_EXTENSION(x) __has_extension (x)
+#else
+  #define ECB_CLANG_EXTENSION(x) 0
+#endif
+
+#define ECB_CPP   (__cplusplus+0)
+#define ECB_CPP11 (__cplusplus >= 201103L)
+
+#if ECB_CPP
+  #define ECB_C            0
+  #define ECB_STDC_VERSION 0
+#else
+  #define ECB_C            1
+  #define ECB_STDC_VERSION __STDC_VERSION__
+#endif
+
+#define ECB_C99   (ECB_STDC_VERSION >= 199901L)
+#define ECB_C11   (ECB_STDC_VERSION >= 201112L)
+
+#if ECB_CPP
+  #define ECB_EXTERN_C extern "C"
+  #define ECB_EXTERN_C_BEG ECB_EXTERN_C {
+  #define ECB_EXTERN_C_END }
+#else
+  #define ECB_EXTERN_C extern
+  #define ECB_EXTERN_C_BEG
+  #define ECB_EXTERN_C_END
+#endif
+
+/*****************************************************************************/
+
+/* ECB_NO_THREADS - ecb is not used by multiple threads, ever */
+/* ECB_NO_SMP     - ecb might be used in multiple threads, but only on a single cpu */
+
+#if ECB_NO_THREADS
+  #define ECB_NO_SMP 1
+#endif
+
+#if ECB_NO_SMP
+  #define ECB_MEMORY_FENCE do { } while (0)
+#endif
+
+/* http://www-01.ibm.com/support/knowledgecenter/SSGH3R_13.1.0/com.ibm.xlcpp131.aix.doc/compiler_ref/compiler_builtins.html */
+#if __xlC__ && ECB_CPP
+  #include <builtins.h>
+#endif
+
+#if 1400 <= _MSC_VER
+  #include <intrin.h> /* fence functions _ReadBarrier, also bit search functions _BitScanReverse */
+#endif
+
+#ifndef ECB_MEMORY_FENCE
+  #if ECB_GCC_VERSION(2,5) || defined __INTEL_COMPILER || (__llvm__ && __GNUC__) || __SUNPRO_C >= 0x5110 || __SUNPRO_CC >= 0x5110
+    #if __i386 || __i386__
+      #define ECB_MEMORY_FENCE         __asm__ __volatile__ ("lock; orb $0, -1(%%esp)" : : : "memory")
+      #define ECB_MEMORY_FENCE_ACQUIRE __asm__ __volatile__ (""                        : : : "memory")
+      #define ECB_MEMORY_FENCE_RELEASE __asm__ __volatile__ ("")
+    #elif ECB_GCC_AMD64
+      #define ECB_MEMORY_FENCE         __asm__ __volatile__ ("mfence"   : : : "memory")
+      #define ECB_MEMORY_FENCE_ACQUIRE __asm__ __volatile__ (""         : : : "memory")
+      #define ECB_MEMORY_FENCE_RELEASE __asm__ __volatile__ ("")
+    #elif __powerpc__ || __ppc__ || __powerpc64__ || __ppc64__
+      #define ECB_MEMORY_FENCE         __asm__ __volatile__ ("sync"     : : : "memory")
+    #elif defined __ARM_ARCH_2__ \
+      || defined
__ARM_ARCH_3__ || defined __ARM_ARCH_3M__ \ + || defined __ARM_ARCH_4__ || defined __ARM_ARCH_4T__ \ + || defined __ARM_ARCH_5__ || defined __ARM_ARCH_5E__ \ + || defined __ARM_ARCH_5T__ || defined __ARM_ARCH_5TE__ \ + || defined __ARM_ARCH_5TEJ__ + /* should not need any, unless running old code on newer cpu - arm doesn't support that */ + #elif defined __ARM_ARCH_6__ || defined __ARM_ARCH_6J__ \ + || defined __ARM_ARCH_6K__ || defined __ARM_ARCH_6ZK__ \ + || defined __ARM_ARCH_6T2__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("mcr p15,0,%0,c7,c10,5" : : "r" (0) : "memory") + #elif defined __ARM_ARCH_7__ || defined __ARM_ARCH_7A__ \ + || defined __ARM_ARCH_7R__ || defined __ARM_ARCH_7M__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("dmb" : : : "memory") + #elif __aarch64__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("dmb ish" : : : "memory") + #elif (__sparc || __sparc__) && !(__sparc_v8__ || defined __sparcv8) + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("membar #LoadStore | #LoadLoad | #StoreStore | #StoreLoad" : : : "memory") + #define ECB_MEMORY_FENCE_ACQUIRE __asm__ __volatile__ ("membar #LoadStore | #LoadLoad" : : : "memory") + #define ECB_MEMORY_FENCE_RELEASE __asm__ __volatile__ ("membar #LoadStore | #StoreStore") + #elif defined __s390__ || defined __s390x__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("bcr 15,0" : : : "memory") + #elif defined __mips__ + /* GNU/Linux emulates sync on mips1 architectures, so we force its use */ + /* anybody else who still uses mips1 is supposed to send in their version, with detection code. */ + #define ECB_MEMORY_FENCE __asm__ __volatile__ (".set mips2; sync; .set mips0" : : : "memory") + #elif defined __alpha__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("mb" : : : "memory") + #elif defined __hppa__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("" : : : "memory") + #define ECB_MEMORY_FENCE_RELEASE __asm__ __volatile__ ("") + #elif defined __ia64__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("mf" : : : "memory") + #elif defined __m68k__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("" : : : "memory") + #elif defined __m88k__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("tb1 0,%%r0,128" : : : "memory") + #elif defined __sh__ + #define ECB_MEMORY_FENCE __asm__ __volatile__ ("" : : : "memory") + #endif + #endif +#endif + +#ifndef ECB_MEMORY_FENCE + #if ECB_GCC_VERSION(4,7) + /* see comment below (stdatomic.h) about the C11 memory model. */ + #define ECB_MEMORY_FENCE __atomic_thread_fence (__ATOMIC_SEQ_CST) + #define ECB_MEMORY_FENCE_ACQUIRE __atomic_thread_fence (__ATOMIC_ACQUIRE) + #define ECB_MEMORY_FENCE_RELEASE __atomic_thread_fence (__ATOMIC_RELEASE) + + #elif ECB_CLANG_EXTENSION(c_atomic) + /* see comment below (stdatomic.h) about the C11 memory model. */ + #define ECB_MEMORY_FENCE __c11_atomic_thread_fence (__ATOMIC_SEQ_CST) + #define ECB_MEMORY_FENCE_ACQUIRE __c11_atomic_thread_fence (__ATOMIC_ACQUIRE) + #define ECB_MEMORY_FENCE_RELEASE __c11_atomic_thread_fence (__ATOMIC_RELEASE) + + #elif ECB_GCC_VERSION(4,4) || defined __INTEL_COMPILER || defined __clang__ + #define ECB_MEMORY_FENCE __sync_synchronize () + #elif _MSC_VER >= 1500 /* VC++ 2008 */ + /* apparently, microsoft broke all the memory barrier stuff in Visual Studio 2008... 
*/
+    #pragma intrinsic(_ReadBarrier,_WriteBarrier,_ReadWriteBarrier)
+    #define ECB_MEMORY_FENCE         _ReadWriteBarrier (); MemoryBarrier()
+    #define ECB_MEMORY_FENCE_ACQUIRE _ReadWriteBarrier (); MemoryBarrier() /* according to msdn, _ReadBarrier is not a load fence */
+    #define ECB_MEMORY_FENCE_RELEASE _WriteBarrier (); MemoryBarrier()
+  #elif _MSC_VER >= 1400 /* VC++ 2005 */
+    #pragma intrinsic(_ReadBarrier,_WriteBarrier,_ReadWriteBarrier)
+    #define ECB_MEMORY_FENCE         _ReadWriteBarrier ()
+    #define ECB_MEMORY_FENCE_ACQUIRE _ReadWriteBarrier () /* according to msdn, _ReadBarrier is not a load fence */
+    #define ECB_MEMORY_FENCE_RELEASE _WriteBarrier ()
+  #elif defined _WIN32
+    #include <WinNT.h>
+    #define ECB_MEMORY_FENCE MemoryBarrier () /* actually just xchg on x86... scary */
+  #elif __SUNPRO_C >= 0x5110 || __SUNPRO_CC >= 0x5110
+    #include <mbarrier.h>
+    #define ECB_MEMORY_FENCE         __machine_rw_barrier ()
+    #define ECB_MEMORY_FENCE_ACQUIRE __machine_r_barrier  ()
+    #define ECB_MEMORY_FENCE_RELEASE __machine_w_barrier  ()
+  #elif __xlC__
+    #define ECB_MEMORY_FENCE __sync ()
+  #endif
+#endif
+
+#ifndef ECB_MEMORY_FENCE
+  #if ECB_C11 && !defined __STDC_NO_ATOMICS__
+    /* we assume that these memory fences work on all variables/all memory accesses, */
+    /* not just C11 atomics and atomic accesses */
+    #include <stdatomic.h>
+    /* Unfortunately, neither gcc 4.7 nor clang 3.1 generate any instructions for */
+    /* any fence other than seq_cst, which isn't very efficient for us. */
+    /* Why that is, we don't know - either the C11 memory model is quite useless */
+    /* for most usages, or gcc and clang have a bug */
+    /* I *currently* lean towards the latter, and inefficiently implement */
+    /* all three of ecb's fences as a seq_cst fence */
+    /* Update, gcc-4.8 generates mfence for all c++ fences, but nothing */
+    /* for all __atomic_thread_fence's except seq_cst */
+    #define ECB_MEMORY_FENCE atomic_thread_fence (memory_order_seq_cst)
+  #endif
+#endif
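+/* Sketch (example only, not part of the source) of the release/acquire
+   pairing these macros provide; evpipe_write further below uses the same
+   pattern: the producer publishes data before raising a flag, and the
+   consumer observes the flag before reading the data. */
+#if 0 /* example only */
+static int example_data, example_flag;
+static void example_producer (void)
+{
+  example_data = 42;
+  ECB_MEMORY_FENCE_RELEASE; /* data becomes visible before the flag */
+  example_flag = 1;
+}
+static int example_consumer (void)
+{
+  if (!example_flag)
+    return -1;
+  ECB_MEMORY_FENCE_ACQUIRE; /* flag was seen before reading the data */
+  return example_data;
+}
+#endif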
+#ifndef ECB_MEMORY_FENCE
+  #if !ECB_AVOID_PTHREADS
+    /*
+     * if you get undefined symbol references to pthread_mutex_lock,
+     * or failure to find pthread.h, then you should implement
+     * the ECB_MEMORY_FENCE operations for your cpu/compiler
+     * OR provide pthread.h and link against the posix thread library
+     * of your system.
+     */
+    #include <pthread.h>
+    #define ECB_NEEDS_PTHREADS 1
+    #define ECB_MEMORY_FENCE_NEEDS_PTHREADS 1
+
+    static pthread_mutex_t ecb_mf_lock = PTHREAD_MUTEX_INITIALIZER;
+    #define ECB_MEMORY_FENCE do { pthread_mutex_lock (&ecb_mf_lock); pthread_mutex_unlock (&ecb_mf_lock); } while (0)
+  #endif
+#endif
+
+#if !defined ECB_MEMORY_FENCE_ACQUIRE && defined ECB_MEMORY_FENCE
+  #define ECB_MEMORY_FENCE_ACQUIRE ECB_MEMORY_FENCE
+#endif
+
+#if !defined ECB_MEMORY_FENCE_RELEASE && defined ECB_MEMORY_FENCE
+  #define ECB_MEMORY_FENCE_RELEASE ECB_MEMORY_FENCE
+#endif
+
+/*****************************************************************************/
+
+#if ECB_CPP
+  #define ecb_inline static inline
+#elif ECB_GCC_VERSION(2,5)
+  #define ecb_inline static __inline__
+#elif ECB_C99
+  #define ecb_inline static inline
+#else
+  #define ecb_inline static
+#endif
+
+#if ECB_GCC_VERSION(3,3)
+  #define ecb_restrict __restrict__
+#elif ECB_C99
+  #define ecb_restrict restrict
+#else
+  #define ecb_restrict
+#endif
+
+typedef int ecb_bool;
+
+#define ECB_CONCAT_(a, b) a ## b
+#define ECB_CONCAT(a, b) ECB_CONCAT_(a, b)
+#define ECB_STRINGIFY_(a) # a
+#define ECB_STRINGIFY(a) ECB_STRINGIFY_(a)
+#define ECB_STRINGIFY_EXPR(expr) ((expr), ECB_STRINGIFY_ (expr))
+
+#define ecb_function_ ecb_inline
+
+#if ECB_GCC_VERSION(3,1) || ECB_CLANG_VERSION(2,8)
+  #define ecb_attribute(attrlist)        __attribute__ (attrlist)
+#else
+  #define ecb_attribute(attrlist)
+#endif
+
+#if ECB_GCC_VERSION(3,1) || ECB_CLANG_BUILTIN(__builtin_constant_p)
+  #define ecb_is_constant(expr)          __builtin_constant_p (expr)
+#else
+  /* possible C11 impl for integral types
+  typedef struct ecb_is_constant_struct ecb_is_constant_struct;
+  #define ecb_is_constant(expr)          _Generic ((1 ? (struct ecb_is_constant_struct *)0 : (void *)((expr) - (expr)), ecb_is_constant_struct *: 0, default: 1)) */
+
+  #define ecb_is_constant(expr)          0
+#endif
+
+#if ECB_GCC_VERSION(3,1) || ECB_CLANG_BUILTIN(__builtin_expect)
+  #define ecb_expect(expr,value)         __builtin_expect ((expr),(value))
+#else
+  #define ecb_expect(expr,value)         (expr)
+#endif
+
+#if ECB_GCC_VERSION(3,1) || ECB_CLANG_BUILTIN(__builtin_prefetch)
+  #define ecb_prefetch(addr,rw,locality) __builtin_prefetch (addr, rw, locality)
+#else
+  #define ecb_prefetch(addr,rw,locality)
+#endif
+
+/* no emulation for ecb_decltype */
+#if ECB_CPP11
+  // older implementations might have problems with decltype(x)::type, work around it
+  template<class T> struct ecb_decltype_t { typedef T type; };
+  #define ecb_decltype(x) ecb_decltype_t<decltype (x)>::type
+#elif ECB_GCC_VERSION(3,0) || ECB_CLANG_VERSION(2,8)
+  #define ecb_decltype(x) __typeof__ (x)
+#endif
+
+#if _MSC_VER >= 1300
+  #define ecb_deprecated __declspec (deprecated)
+#else
+  #define ecb_deprecated ecb_attribute ((__deprecated__))
+#endif
+
+#if _MSC_VER >= 1500
+  #define ecb_deprecated_message(msg) __declspec (deprecated (msg))
+#elif ECB_GCC_VERSION(4,5)
+  #define ecb_deprecated_message(msg) ecb_attribute ((__deprecated__ (msg)))
+#else
+  #define ecb_deprecated_message(msg) ecb_deprecated
+#endif
+
+#if _MSC_VER >= 1400
+  #define ecb_noinline __declspec (noinline)
+#else
+  #define ecb_noinline ecb_attribute ((__noinline__))
+#endif
+
+#define ecb_unused  ecb_attribute ((__unused__))
+#define ecb_const   ecb_attribute ((__const__))
+#define ecb_pure    ecb_attribute ((__pure__))
+
+#if ECB_C11 || __IBMC_NORETURN
+  /* http://www-01.ibm.com/support/knowledgecenter/SSGH3R_13.1.0/com.ibm.xlcpp131.aix.doc/language_ref/noreturn.html */
+  #define ecb_noreturn _Noreturn
+#elif ECB_CPP11
+  #define ecb_noreturn [[noreturn]]
+#elif _MSC_VER >= 1200 + /* http://msdn.microsoft.com/en-us/library/k6ktzx3s.aspx */ + #define ecb_noreturn __declspec (noreturn) +#else + #define ecb_noreturn ecb_attribute ((__noreturn__)) +#endif + +#if ECB_GCC_VERSION(4,3) + #define ecb_artificial ecb_attribute ((__artificial__)) + #define ecb_hot ecb_attribute ((__hot__)) + #define ecb_cold ecb_attribute ((__cold__)) +#else + #define ecb_artificial + #define ecb_hot + #define ecb_cold +#endif + +/* put around conditional expressions if you are very sure that the */ +/* expression is mostly true or mostly false. note that these return */ +/* booleans, not the expression. */ +#define ecb_expect_false(expr) ecb_expect (!!(expr), 0) +#define ecb_expect_true(expr) ecb_expect (!!(expr), 1) +/* for compatibility to the rest of the world */ +#define ecb_likely(expr) ecb_expect_true (expr) +#define ecb_unlikely(expr) ecb_expect_false (expr) + +/* count trailing zero bits and count # of one bits */ +#if ECB_GCC_VERSION(3,4) \ + || (ECB_CLANG_BUILTIN(__builtin_clz) && ECB_CLANG_BUILTIN(__builtin_clzll) \ + && ECB_CLANG_BUILTIN(__builtin_ctz) && ECB_CLANG_BUILTIN(__builtin_ctzll) \ + && ECB_CLANG_BUILTIN(__builtin_popcount)) + /* we assume int == 32 bit, long == 32 or 64 bit and long long == 64 bit */ + #define ecb_ld32(x) (__builtin_clz (x) ^ 31) + #define ecb_ld64(x) (__builtin_clzll (x) ^ 63) + #define ecb_ctz32(x) __builtin_ctz (x) + #define ecb_ctz64(x) __builtin_ctzll (x) + #define ecb_popcount32(x) __builtin_popcount (x) + /* no popcountll */ +#else + ecb_function_ ecb_const int ecb_ctz32 (uint32_t x); + ecb_function_ ecb_const int + ecb_ctz32 (uint32_t x) + { +#if 1400 <= _MSC_VER && (_M_IX86 || _M_X64 || _M_IA64 || _M_ARM) + unsigned long r; + _BitScanForward (&r, x); + return (int)r; +#else + int r = 0; + + x &= ~x + 1; /* this isolates the lowest bit */ + +#if ECB_branchless_on_i386 + r += !!(x & 0xaaaaaaaa) << 0; + r += !!(x & 0xcccccccc) << 1; + r += !!(x & 0xf0f0f0f0) << 2; + r += !!(x & 0xff00ff00) << 3; + r += !!(x & 0xffff0000) << 4; +#else + if (x & 0xaaaaaaaa) r += 1; + if (x & 0xcccccccc) r += 2; + if (x & 0xf0f0f0f0) r += 4; + if (x & 0xff00ff00) r += 8; + if (x & 0xffff0000) r += 16; +#endif + + return r; +#endif + } + + ecb_function_ ecb_const int ecb_ctz64 (uint64_t x); + ecb_function_ ecb_const int + ecb_ctz64 (uint64_t x) + { +#if 1400 <= _MSC_VER && (_M_X64 || _M_IA64 || _M_ARM) + unsigned long r; + _BitScanForward64 (&r, x); + return (int)r; +#else + int shift = x & 0xffffffff ? 
0 : 32; + return ecb_ctz32 (x >> shift) + shift; +#endif + } + + ecb_function_ ecb_const int ecb_popcount32 (uint32_t x); + ecb_function_ ecb_const int + ecb_popcount32 (uint32_t x) + { + x -= (x >> 1) & 0x55555555; + x = ((x >> 2) & 0x33333333) + (x & 0x33333333); + x = ((x >> 4) + x) & 0x0f0f0f0f; + x *= 0x01010101; + + return x >> 24; + } + + ecb_function_ ecb_const int ecb_ld32 (uint32_t x); + ecb_function_ ecb_const int ecb_ld32 (uint32_t x) + { +#if 1400 <= _MSC_VER && (_M_IX86 || _M_X64 || _M_IA64 || _M_ARM) + unsigned long r; + _BitScanReverse (&r, x); + return (int)r; +#else + int r = 0; + + if (x >> 16) { x >>= 16; r += 16; } + if (x >> 8) { x >>= 8; r += 8; } + if (x >> 4) { x >>= 4; r += 4; } + if (x >> 2) { x >>= 2; r += 2; } + if (x >> 1) { r += 1; } + + return r; +#endif + } + + ecb_function_ ecb_const int ecb_ld64 (uint64_t x); + ecb_function_ ecb_const int ecb_ld64 (uint64_t x) + { +#if 1400 <= _MSC_VER && (_M_X64 || _M_IA64 || _M_ARM) + unsigned long r; + _BitScanReverse64 (&r, x); + return (int)r; +#else + int r = 0; + + if (x >> 32) { x >>= 32; r += 32; } + + return r + ecb_ld32 (x); +#endif + } +#endif + +ecb_function_ ecb_const ecb_bool ecb_is_pot32 (uint32_t x); +ecb_function_ ecb_const ecb_bool ecb_is_pot32 (uint32_t x) { return !(x & (x - 1)); } +ecb_function_ ecb_const ecb_bool ecb_is_pot64 (uint64_t x); +ecb_function_ ecb_const ecb_bool ecb_is_pot64 (uint64_t x) { return !(x & (x - 1)); } + +ecb_function_ ecb_const uint8_t ecb_bitrev8 (uint8_t x); +ecb_function_ ecb_const uint8_t ecb_bitrev8 (uint8_t x) +{ + return ( (x * 0x0802U & 0x22110U) + | (x * 0x8020U & 0x88440U)) * 0x10101U >> 16; +} + +ecb_function_ ecb_const uint16_t ecb_bitrev16 (uint16_t x); +ecb_function_ ecb_const uint16_t ecb_bitrev16 (uint16_t x) +{ + x = ((x >> 1) & 0x5555) | ((x & 0x5555) << 1); + x = ((x >> 2) & 0x3333) | ((x & 0x3333) << 2); + x = ((x >> 4) & 0x0f0f) | ((x & 0x0f0f) << 4); + x = ( x >> 8 ) | ( x << 8); + + return x; +} + +ecb_function_ ecb_const uint32_t ecb_bitrev32 (uint32_t x); +ecb_function_ ecb_const uint32_t ecb_bitrev32 (uint32_t x) +{ + x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1); + x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2); + x = ((x >> 4) & 0x0f0f0f0f) | ((x & 0x0f0f0f0f) << 4); + x = ((x >> 8) & 0x00ff00ff) | ((x & 0x00ff00ff) << 8); + x = ( x >> 16 ) | ( x << 16); + + return x; +} + +/* popcount64 is only available on 64 bit cpus as gcc builtin */ +/* so for this version we are lazy */ +ecb_function_ ecb_const int ecb_popcount64 (uint64_t x); +ecb_function_ ecb_const int +ecb_popcount64 (uint64_t x) +{ + return ecb_popcount32 (x) + ecb_popcount32 (x >> 32); +} + +ecb_inline ecb_const uint8_t ecb_rotl8 (uint8_t x, unsigned int count); +ecb_inline ecb_const uint8_t ecb_rotr8 (uint8_t x, unsigned int count); +ecb_inline ecb_const uint16_t ecb_rotl16 (uint16_t x, unsigned int count); +ecb_inline ecb_const uint16_t ecb_rotr16 (uint16_t x, unsigned int count); +ecb_inline ecb_const uint32_t ecb_rotl32 (uint32_t x, unsigned int count); +ecb_inline ecb_const uint32_t ecb_rotr32 (uint32_t x, unsigned int count); +ecb_inline ecb_const uint64_t ecb_rotl64 (uint64_t x, unsigned int count); +ecb_inline ecb_const uint64_t ecb_rotr64 (uint64_t x, unsigned int count); + +ecb_inline ecb_const uint8_t ecb_rotl8 (uint8_t x, unsigned int count) { return (x >> ( 8 - count)) | (x << count); } +ecb_inline ecb_const uint8_t ecb_rotr8 (uint8_t x, unsigned int count) { return (x << ( 8 - count)) | (x >> count); } +ecb_inline ecb_const uint16_t ecb_rotl16 (uint16_t x, 
unsigned int count) { return (x >> (16 - count)) | (x << count); }
+ecb_inline ecb_const uint16_t ecb_rotr16 (uint16_t x, unsigned int count) { return (x << (16 - count)) | (x >> count); }
+ecb_inline ecb_const uint32_t ecb_rotl32 (uint32_t x, unsigned int count) { return (x >> (32 - count)) | (x << count); }
+ecb_inline ecb_const uint32_t ecb_rotr32 (uint32_t x, unsigned int count) { return (x << (32 - count)) | (x >> count); }
+ecb_inline ecb_const uint64_t ecb_rotl64 (uint64_t x, unsigned int count) { return (x >> (64 - count)) | (x << count); }
+ecb_inline ecb_const uint64_t ecb_rotr64 (uint64_t x, unsigned int count) { return (x << (64 - count)) | (x >> count); }
+
+#if ECB_GCC_VERSION(4,3) || (ECB_CLANG_BUILTIN(__builtin_bswap32) && ECB_CLANG_BUILTIN(__builtin_bswap64))
+  #if ECB_GCC_VERSION(4,8) || ECB_CLANG_BUILTIN(__builtin_bswap16)
+  #define ecb_bswap16(x)  __builtin_bswap16 (x)
+  #else
+  #define ecb_bswap16(x) (__builtin_bswap32 (x) >> 16)
+  #endif
+  #define ecb_bswap32(x)  __builtin_bswap32 (x)
+  #define ecb_bswap64(x)  __builtin_bswap64 (x)
+#elif _MSC_VER
+  #include <stdlib.h>
+  #define ecb_bswap16(x) ((uint16_t)_byteswap_ushort ((uint16_t)(x)))
+  #define ecb_bswap32(x) ((uint32_t)_byteswap_ulong  ((uint32_t)(x)))
+  #define ecb_bswap64(x) ((uint64_t)_byteswap_uint64 ((uint64_t)(x)))
+#else
+  ecb_function_ ecb_const uint16_t ecb_bswap16 (uint16_t x);
+  ecb_function_ ecb_const uint16_t
+  ecb_bswap16 (uint16_t x)
+  {
+    return ecb_rotl16 (x, 8);
+  }
+
+  ecb_function_ ecb_const uint32_t ecb_bswap32 (uint32_t x);
+  ecb_function_ ecb_const uint32_t
+  ecb_bswap32 (uint32_t x)
+  {
+    return (((uint32_t)ecb_bswap16 (x)) << 16) | ecb_bswap16 (x >> 16);
+  }
+
+  ecb_function_ ecb_const uint64_t ecb_bswap64 (uint64_t x);
+  ecb_function_ ecb_const uint64_t
+  ecb_bswap64 (uint64_t x)
+  {
+    return (((uint64_t)ecb_bswap32 (x)) << 32) | ecb_bswap32 (x >> 32);
+  }
+#endif
+
+#if ECB_GCC_VERSION(4,5) || ECB_CLANG_BUILTIN(__builtin_unreachable)
+  #define ecb_unreachable() __builtin_unreachable ()
+#else
+  /* this seems to work fine, but gcc always emits a warning for it :/ */
+  ecb_inline ecb_noreturn void ecb_unreachable (void);
+  ecb_inline ecb_noreturn void ecb_unreachable (void) { }
+#endif
+
+/* try to tell the compiler that some condition is definitely true */
+#define ecb_assume(cond) if (!(cond)) ecb_unreachable (); else 0
+
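+/* Quick sanity examples (example only, not part of the source) for the rotate
+   and byte-swap helpers above, usable as a self-test. */
+#if 0 /* example only */
+#include <assert.h>
+static void ecb_swap_examples (void)
+{
+  assert (ecb_rotl32 (0x80000001u, 1) == 0x00000003u);
+  assert (ecb_bswap16 (0x1234)        == 0x3412);
+  assert (ecb_bswap32 (0x11223344u)   == 0x44332211u);
+}
+#endif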
+ecb_inline ecb_const uint32_t ecb_byteorder_helper (void);
+ecb_inline ecb_const uint32_t
+ecb_byteorder_helper (void)
+{
+  /* the union code still generates code under pressure in gcc, */
+  /* but less than using pointers, and always seems to */
+  /* successfully return a constant. */
+  /* the reason why we have this horrible preprocessor mess */
+  /* is to avoid it in all cases, at least on common architectures */
+  /* or when using a recent enough gcc version (>= 4.6) */
+#if (defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+    || ((__i386 || __i386__ || _M_IX86 || ECB_GCC_AMD64 || ECB_MSVC_AMD64) && !__VOS__)
+  #define ECB_LITTLE_ENDIAN 1
+  return 0x44332211;
+#elif (defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) \
+      || ((__AARCH64EB__ || __MIPSEB__ || __ARMEB__) && !__VOS__)
+  #define ECB_BIG_ENDIAN 1
+  return 0x11223344;
+#else
+  union
+  {
+    uint8_t c[4];
+    uint32_t u;
+  } u = { 0x11, 0x22, 0x33, 0x44 };
+  return u.u;
+#endif
+}
+
+ecb_inline ecb_const ecb_bool ecb_big_endian    (void);
+ecb_inline ecb_const ecb_bool ecb_big_endian    (void) { return ecb_byteorder_helper () == 0x11223344; }
+ecb_inline ecb_const ecb_bool ecb_little_endian (void);
+ecb_inline ecb_const ecb_bool ecb_little_endian (void) { return ecb_byteorder_helper () == 0x44332211; }
+
+#if ECB_GCC_VERSION(3,0) || ECB_C99
+  #define ecb_mod(m,n) ((m) % (n) + ((m) % (n) < 0 ? (n) : 0))
+#else
+  #define ecb_mod(m,n) ((m) < 0 ? ((n) - 1 - ((-1 - (m)) % (n))) : ((m) % (n)))
+#endif
+
+#if ECB_CPP
+  template<typename T>
+  static inline T ecb_div_rd (T val, T div)
+  {
+    return val < 0 ? - ((-val + div - 1) / div) : (val          ) / div;
+  }
+  template<typename T>
+  static inline T ecb_div_ru (T val, T div)
+  {
+    return val < 0 ? - ((-val            ) / div) : (val + div - 1) / div;
+  }
+#else
+  #define ecb_div_rd(val,div) ((val) < 0 ? - ((-(val) + (div) - 1) / (div)) : ((val)            ) / (div))
+  #define ecb_div_ru(val,div) ((val) < 0 ? - ((-(val)            ) / (div)) : ((val) + (div) - 1) / (div))
+#endif
+
+#if ecb_cplusplus_does_not_suck
+  /* does not work for local types (http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2657.htm) */
+  template<typename T, int N>
+  static inline int ecb_array_length (const T (&arr)[N])
+  {
+    return N;
+  }
+#else
+  #define ecb_array_length(name) (sizeof (name) / sizeof (name [0]))
+#endif
+
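+/* Floored arithmetic examples (example only, not part of the source): unlike
+   C99's truncating operators, ecb_mod always yields a result in [0, n) and
+   ecb_div_rd rounds towards negative infinity. */
+#if 0 /* example only */
+#include <assert.h>
+static void ecb_mod_examples (void)
+{
+  assert (-7 % 3 == -1);             /* C99 truncating modulo */
+  assert (ecb_mod (-7, 3) == 2);     /* floored modulo */
+  assert (ecb_div_rd (-7, 2) == -4); /* floor (-3.5) */
+  assert (ecb_div_ru ( 7, 2) ==  4); /* ceil  ( 3.5) */
+}
+#endif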
+ecb_function_ ecb_const uint32_t ecb_binary16_to_binary32 (uint32_t x);
+ecb_function_ ecb_const uint32_t
+ecb_binary16_to_binary32 (uint32_t x)
+{
+  unsigned int s = (x & 0x8000) << (31 - 15);
+  int          e = (x >> 10) & 0x001f;
+  unsigned int m =  x        & 0x03ff;
+
+  if (ecb_expect_false (e == 31))
+    /* infinity or NaN */
+    e = 255 - (127 - 15);
+  else if (ecb_expect_false (!e))
+    {
+      if (ecb_expect_true (!m))
+        /* zero, handled by code below by forcing e to 0 */
+        e = 0 - (127 - 15);
+      else
+        {
+          /* subnormal, renormalise */
+          unsigned int s = 10 - ecb_ld32 (m);
+
+          m = (m << s) & 0x3ff; /* mask implicit bit */
+          e -= s - 1;
+        }
+    }
+
+  /* e and m now are normalised, or zero, (or inf or nan) */
+  e += 127 - 15;
+
+  return s | (e << 23) | (m << (23 - 10));
+}
+
+ecb_function_ ecb_const uint16_t ecb_binary32_to_binary16 (uint32_t x);
+ecb_function_ ecb_const uint16_t
+ecb_binary32_to_binary16 (uint32_t x)
+{
+  unsigned int s =  (x >> 16) & 0x00008000; /* sign bit, the easy part */
+  unsigned int e = ((x >> 23) & 0x000000ff) - (127 - 15); /* the desired exponent */
+  unsigned int m =   x        & 0x007fffff;
+
+  x &= 0x7fffffff;
+
+  /* if it's within range of binary16 normals, use fast path */
+  if (ecb_expect_true (0x38800000 <= x && x <= 0x477fefff))
+    {
+      /* mantissa round-to-even */
+      m += 0x00000fff + ((m >> (23 - 10)) & 1);
+
+      /* handle overflow */
+      if (ecb_expect_false (m >= 0x00800000))
+        {
+          m >>= 1;
+          e +=  1;
+        }
+
+      return s | (e << 10) | (m >> (23 - 10));
+    }
+
+  /* handle large numbers and infinity */
+  if (ecb_expect_true (0x477fefff < x && x <= 0x7f800000))
+    return s | 0x7c00;
+
+  /* handle zero, subnormals and small numbers */
+  if (ecb_expect_true (x < 0x38800000))
+    {
+      /* zero */
+      if (ecb_expect_true (!x))
+        return s;
+
+      /* handle subnormals */
+
+      /* too small, will be zero */
+      if (e < (14 - 24)) /* might not be sharp, but is good enough */
+        return s;
+
+      m |= 0x00800000; /* make implicit bit explicit */
+
+      /* very tricky - we need to round to the nearest e (+10) bit value */
+      {
+        unsigned int bits = 14 - e;
+        unsigned int half = (1 << (bits - 1)) - 1;
+        unsigned int even = (m >> bits) & 1;
+
+        /* if this overflows, we will end up with a normalised number */
+        m = (m + half + even) >> bits;
+      }
+
+      return s | m;
+    }
+
+  /* handle NaNs, preserve leftmost nan bits, but make sure we don't turn them into infinities */
+  m >>= 13;
+
+  return s | 0x7c00 | m | !m;
+}
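+/* Round-trip examples (example only, not part of the source) for the two
+   half-float conversions above, using well-known IEEE encodings. */
+#if 0 /* example only */
+#include <assert.h>
+static void ecb_binary16_examples (void)
+{
+  assert (ecb_binary16_to_binary32 (0x3c00) == 0x3f800000); /*  1.0 */
+  assert (ecb_binary16_to_binary32 (0xc000) == 0xc0000000); /* -2.0 */
+  assert (ecb_binary32_to_binary16 (0x3f800000) == 0x3c00); /*  1.0 */
+}
+#endif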
+/*******************************************************************************/
+/* floating point stuff, can be disabled by defining ECB_NO_LIBM */
+
+/* basically, everything uses "ieee pure-endian" floating point numbers */
+/* the only noteworthy exception is ancient armle, which uses order 43218765 */
+#if 0 \
+    || __i386 || __i386__ \
+    || ECB_GCC_AMD64 \
+    || __powerpc__ || __ppc__ || __powerpc64__ || __ppc64__ \
+    || defined __s390__ || defined __s390x__ \
+    || defined __mips__ \
+    || defined __alpha__ \
+    || defined __hppa__ \
+    || defined __ia64__ \
+    || defined __m68k__ \
+    || defined __m88k__ \
+    || defined __sh__ \
+    || defined _M_IX86 || defined ECB_MSVC_AMD64 || defined _M_IA64 \
+    || (defined __arm__ && (defined __ARM_EABI__ || defined __EABI__ || defined __VFP_FP__ || defined _WIN32_WCE || defined __ANDROID__)) \
+    || defined __aarch64__
+  #define ECB_STDFP 1
+  #include <string.h> /* for memcpy */
+#else
+  #define ECB_STDFP 0
+#endif
+
+#ifndef ECB_NO_LIBM
+
+  #include <math.h> /* for frexp*, ldexp*, INFINITY, NAN */
+
+  /* only the oldest of old doesn't have this one. solaris. */
+  #ifdef INFINITY
+    #define ECB_INFINITY INFINITY
+  #else
+    #define ECB_INFINITY HUGE_VAL
+  #endif
+
+  #ifdef NAN
+    #define ECB_NAN NAN
+  #else
+    #define ECB_NAN ECB_INFINITY
+  #endif
+
+  #if ECB_C99 || _XOPEN_VERSION >= 600 || _POSIX_VERSION >= 200112L
+    #define ecb_ldexpf(x,e) ldexpf ((x), (e))
+    #define ecb_frexpf(x,e) frexpf ((x), (e))
+  #else
+    #define ecb_ldexpf(x,e) (float) ldexp ((double) (x), (e))
+    #define ecb_frexpf(x,e) (float) frexp ((double) (x), (e))
+  #endif
+
+  /* convert a float to ieee single/binary32 */
+  ecb_function_ ecb_const uint32_t ecb_float_to_binary32 (float x);
+  ecb_function_ ecb_const uint32_t
+  ecb_float_to_binary32 (float x)
+  {
+    uint32_t r;
+
+    #if ECB_STDFP
+      memcpy (&r, &x, 4);
+    #else
+      /* slow emulation, works for anything but -0 */
+      uint32_t m;
+      int e;
+
+      if (x == 0e0f                    ) return 0x00000000U;
+      if (x > +3.40282346638528860e+38f) return 0x7f800000U;
+      if (x < -3.40282346638528860e+38f) return 0xff800000U;
+      if (x != x                       ) return 0x7fbfffffU;
+
+      m = ecb_frexpf (x, &e) * 0x1000000U;
+
+      r = m & 0x80000000U;
+
+      if (r)
+        m = -m;
+
+      if (e <= -126)
+        {
+          m &= 0xffffffU;
+          m >>= (-125 - e);
+          e = -126;
+        }
+
+      r |= (e + 126) << 23;
+      r |= m & 0x7fffffU;
+    #endif
+
+    return r;
+  }
+
+  /* converts an ieee single/binary32 to a float */
+  ecb_function_ ecb_const float ecb_binary32_to_float (uint32_t x);
+  ecb_function_ ecb_const float
+  ecb_binary32_to_float (uint32_t x)
+  {
+    float r;
+
+    #if ECB_STDFP
+      memcpy (&r, &x, 4);
+    #else
+      /* emulation, only works for normals and subnormals and +0 */
+      int neg = x >> 31;
+      int e = (x >> 23) & 0xffU;
+
+      x &= 0x7fffffU;
+
+      if (e)
+        x |= 0x800000U;
+      else
+        e = 1;
+
+      /* we distrust ldexpf a bit and do the 2**-24 scaling by an extra multiply */
+      r = ecb_ldexpf (x * (0.5f / 0x800000U), e - 126);
+
+      r = neg ? -r : r;
+    #endif
+
+    return r;
+  }
+
+  /* convert a double to ieee double/binary64 */
+  ecb_function_ ecb_const uint64_t ecb_double_to_binary64 (double x);
+  ecb_function_ ecb_const uint64_t
+  ecb_double_to_binary64 (double x)
+  {
+    uint64_t r;
+
+    #if ECB_STDFP
+      memcpy (&r, &x, 8);
+    #else
+      /* slow emulation, works for anything but -0 */
+      uint64_t m;
+      int e;
+
+      if (x == 0e0                     ) return 0x0000000000000000U;
+      if (x > +1.79769313486231470e+308) return 0x7ff0000000000000U;
+      if (x < -1.79769313486231470e+308) return 0xfff0000000000000U;
+      if (x != x                       ) return 0X7ff7ffffffffffffU;
+
+      m = frexp (x, &e) * 0x20000000000000U;
+
+      r = m & 0x8000000000000000;
+
+      if (r)
+        m = -m;
+
+      if (e <= -1022)
+        {
+          m &= 0x1fffffffffffffU;
+          m >>= (-1021 - e);
+          e = -1022;
+        }
+
+      r |= ((uint64_t)(e + 1022)) << 52;
+      r |= m & 0xfffffffffffffU;
+    #endif
+
+    return r;
+  }
+
+  /* converts an ieee double/binary64 to a double */
+  ecb_function_ ecb_const double ecb_binary64_to_double (uint64_t x);
+  ecb_function_ ecb_const double
+  ecb_binary64_to_double (uint64_t x)
+  {
+    double r;
+
+    #if ECB_STDFP
+      memcpy (&r, &x, 8);
+    #else
+      /* emulation, only works for normals and subnormals and +0 */
+      int neg = x >> 63;
+      int e = (x >> 52) & 0x7ffU;
+
+      x &= 0xfffffffffffffU;
+
+      if (e)
+        x |= 0x10000000000000U;
+      else
+        e = 1;
+
+      /* we distrust ldexp a bit and do the 2**-53 scaling by an extra multiply */
+      r = ldexp (x * (0.5 / 0x10000000000000U), e - 1022);
+
+      r = neg ? -r : r;
+    #endif
+
+    return r;
+  }
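+/* Spot-check examples (example only, not part of the source) for the binary64
+   conversions above, using well-known IEEE encodings. */
+#if 0 /* example only */
+#include <assert.h>
+static void ecb_binary64_examples (void)
+{
+  assert (ecb_double_to_binary64 (1.0) == 0x3ff0000000000000U);
+  assert (ecb_double_to_binary64 (2.0) == 0x4000000000000000U);
+  assert (ecb_binary64_to_double (0xbff0000000000000U) == -1.0);
+}
+#endif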
+  /* convert a float to ieee half/binary16 */
+  ecb_function_ ecb_const uint16_t ecb_float_to_binary16 (float x);
+  ecb_function_ ecb_const uint16_t
+  ecb_float_to_binary16 (float x)
+  {
+    return ecb_binary32_to_binary16 (ecb_float_to_binary32 (x));
+  }
+
+  /* convert an ieee half/binary16 to float */
+  ecb_function_ ecb_const float ecb_binary16_to_float (uint16_t x);
+  ecb_function_ ecb_const float
+  ecb_binary16_to_float (uint16_t x)
+  {
+    return ecb_binary32_to_float (ecb_binary16_to_binary32 (x));
+  }
+
+#endif
+
+#endif
+
+/* ECB.H END */
+
+#if ECB_MEMORY_FENCE_NEEDS_PTHREADS
+/* if your architecture doesn't need memory fences, e.g. because it is
+ * single-cpu/core, or if you use libev in a project that doesn't use libev
+ * from multiple threads, then you can define ECB_AVOID_PTHREADS when compiling
+ * libev, in which cases the memory fences become nops.
+ * alternatively, you can remove this #error and link against libpthread,
+ * which will then provide the memory fences.
+ */
+# error "memory fences not defined for your architecture, please report"
+#endif
+
+#ifndef ECB_MEMORY_FENCE
+# define ECB_MEMORY_FENCE do { } while (0)
+# define ECB_MEMORY_FENCE_ACQUIRE ECB_MEMORY_FENCE
+# define ECB_MEMORY_FENCE_RELEASE ECB_MEMORY_FENCE
+#endif
+
+#define expect_false(cond) ecb_expect_false (cond)
+#define expect_true(cond)  ecb_expect_true  (cond)
+#define noinline           ecb_noinline
+
+#define inline_size        ecb_inline
+
+#if EV_FEATURE_CODE
+# define inline_speed      ecb_inline
+#else
+# define inline_speed      noinline static
+#endif
+
+#define NUMPRI (EV_MAXPRI - EV_MINPRI + 1)
+
+#if EV_MINPRI == EV_MAXPRI
+# define ABSPRI(w) (((W)w), 0)
+#else
+# define ABSPRI(w) (((W)w)->priority - EV_MINPRI)
+#endif
+
+#define EMPTY       /* required for microsofts broken pseudo-c compiler */
+#define EMPTY2(a,b) /* used to suppress some warnings */
+
+typedef ev_watcher *W;
+typedef ev_watcher_list *WL;
+typedef ev_watcher_time *WT;
+
+#define ev_active(w) ((W)(w))->active
+#define ev_at(w) ((WT)(w))->at
+
+#if EV_USE_REALTIME
+/* sig_atomic_t is used to avoid per-thread variables or locking but still */
+/* giving it a reasonably high chance of working on typical architectures */
+static EV_ATOMIC_T have_realtime; /* did clock_gettime (CLOCK_REALTIME) work? */
+#endif
+
+#if EV_USE_MONOTONIC
+static EV_ATOMIC_T have_monotonic; /* did clock_gettime (CLOCK_MONOTONIC) work? */
+#endif
+
+#ifndef EV_FD_TO_WIN32_HANDLE
+# define EV_FD_TO_WIN32_HANDLE(fd) _get_osfhandle (fd)
+#endif
+#ifndef EV_WIN32_HANDLE_TO_FD
+# define EV_WIN32_HANDLE_TO_FD(handle) _open_osfhandle (handle, 0)
+#endif
+#ifndef EV_WIN32_CLOSE_FD
+# define EV_WIN32_CLOSE_FD(fd) close (fd)
+#endif
+
+#ifdef _WIN32
+# include "ev_win32.c"
+#endif
+
+/*****************************************************************************/
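+/* Behaviour examples (example only, not part of the source) for the ev_floor
+   replacement defined right below: it always rounds towards negative
+   infinity, which is what the periodic-timer code needs. */
+#if 0 /* example only */
+#include <assert.h>
+static void ev_floor_examples (void)
+{
+  assert (ev_floor ( 3.7) ==  3.);
+  assert (ev_floor (-3.7) == -4.); /* not truncation towards zero */
+}
+#endif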
+/* define a suitable floor function (only used by periodics atm) */
+
+#if EV_USE_FLOOR
+# include <math.h>
+# define ev_floor(v) floor (v)
+#else
+
+#include <float.h>
+
+/* a floor() replacement function, should be independent of ev_tstamp type */
+noinline
+static ev_tstamp
+ev_floor (ev_tstamp v)
+{
+  /* the choice of shift factor is not terribly important */
+#if FLT_RADIX != 2 /* assume FLT_RADIX == 10 */
+  const ev_tstamp shift = sizeof (unsigned long) >= 8 ? 10000000000000000000. : 1000000000.;
+#else
+  const ev_tstamp shift = sizeof (unsigned long) >= 8 ? 18446744073709551616. : 4294967296.;
+#endif
+
+  /* argument too large for an unsigned long? */
+  if (expect_false (v >= shift))
+    {
+      ev_tstamp f;
+
+      if (v == v - 1.)
+        return v; /* very large number */
+
+      f = shift * ev_floor (v * (1. / shift));
+      return f + ev_floor (v - f);
+    }
+
+  /* special treatment for negative args? */
+  if (expect_false (v < 0.))
+    {
+      ev_tstamp f = -ev_floor (-v);
+
+      return f - (f == v ? 0 : 1);
+    }
+
+  /* fits into an unsigned long */
+  return (unsigned long)v;
+}
+
+#endif
+
+/*****************************************************************************/
+
+#ifdef __linux
+# include <sys/utsname.h>
+#endif
+
+noinline ecb_cold
+static unsigned int
+ev_linux_version (void)
+{
+#ifdef __linux
+  unsigned int v = 0;
+  struct utsname buf;
+  int i;
+  char *p = buf.release;
+
+  if (uname (&buf))
+    return 0;
+
+  for (i = 3+1; --i; )
+    {
+      unsigned int c = 0;
+
+      for (;;)
+        {
+          if (*p >= '0' && *p <= '9')
+            c = c * 10 + *p++ - '0';
+          else
+            {
+              p += *p == '.';
+              break;
+            }
+        }
+
+      v = (v << 8) | c;
+    }
+
+  return v;
+#else
+  return 0;
+#endif
+}
+
+/*****************************************************************************/
+
+#if EV_AVOID_STDIO
+noinline ecb_cold
+static void
+ev_printerr (const char *msg)
+{
+  write (STDERR_FILENO, msg, strlen (msg));
+}
+#endif
+
+static void (*syserr_cb)(const char *msg) EV_THROW;
+
+ecb_cold
+void
+ev_set_syserr_cb (void (*cb)(const char *msg) EV_THROW) EV_THROW
+{
+  syserr_cb = cb;
+}
+
+noinline ecb_cold
+static void
+ev_syserr (const char *msg)
+{
+  if (!msg)
+    msg = "(libev) system error";
+
+  if (syserr_cb)
+    syserr_cb (msg);
+  else
+    {
+#if EV_AVOID_STDIO
+      ev_printerr (msg);
+      ev_printerr (": ");
+      ev_printerr (strerror (errno));
+      ev_printerr ("\n");
+#else
+      perror (msg);
+#endif
+      abort ();
+    }
+}
+
+static void *
+ev_realloc_emul (void *ptr, long size) EV_THROW
+{
+  /* some systems, notably openbsd and darwin, fail to properly
+   * implement realloc (x, 0) (as required by both ansi c-89 and
+   * the single unix specification, so work around them here.
+   * recently, also (at least) fedora and debian started breaking it,
+   * despite documenting it otherwise.
+ */ + + if (size) + return realloc (ptr, size); + + free (ptr); + return 0; +} + +static void *(*alloc)(void *ptr, long size) EV_THROW = ev_realloc_emul; + +ecb_cold +void +ev_set_allocator (void *(*cb)(void *ptr, long size) EV_THROW) EV_THROW +{ + alloc = cb; +} + +inline_speed void * +ev_realloc (void *ptr, long size) +{ + ptr = alloc (ptr, size); + + if (!ptr && size) + { +#if EV_AVOID_STDIO + ev_printerr ("(libev) memory allocation failed, aborting.\n"); +#else + fprintf (stderr, "(libev) cannot allocate %ld bytes, aborting.", size); +#endif + abort (); + } + + return ptr; +} + +#define ev_malloc(size) ev_realloc (0, (size)) +#define ev_free(ptr) ev_realloc ((ptr), 0) + +/*****************************************************************************/ + +/* set in reify when reification needed */ +#define EV_ANFD_REIFY 1 + +/* file descriptor info structure */ +typedef struct +{ + WL head; + unsigned char events; /* the events watched for */ + unsigned char reify; /* flag set when this ANFD needs reification (EV_ANFD_REIFY, EV__IOFDSET) */ + unsigned char emask; /* the epoll backend stores the actual kernel mask in here */ + unsigned char unused; +#if EV_USE_EPOLL + unsigned int egen; /* generation counter to counter epoll bugs */ +#endif +#if EV_SELECT_IS_WINSOCKET || EV_USE_IOCP + SOCKET handle; +#endif +#if EV_USE_IOCP + OVERLAPPED or, ow; +#endif +} ANFD; + +/* stores the pending event set for a given watcher */ +typedef struct +{ + W w; + int events; /* the pending event set for the given watcher */ +} ANPENDING; + +#if EV_USE_INOTIFY +/* hash table entry per inotify-id */ +typedef struct +{ + WL head; +} ANFS; +#endif + +/* Heap Entry */ +#if EV_HEAP_CACHE_AT + /* a heap element */ + typedef struct { + ev_tstamp at; + WT w; + } ANHE; + + #define ANHE_w(he) (he).w /* access watcher, read-write */ + #define ANHE_at(he) (he).at /* access cached at, read-only */ + #define ANHE_at_cache(he) (he).at = (he).w->at /* update at from watcher */ +#else + /* a heap element */ + typedef WT ANHE; + + #define ANHE_w(he) (he) + #define ANHE_at(he) (he)->at + #define ANHE_at_cache(he) +#endif + +#if EV_MULTIPLICITY + + struct ev_loop + { + ev_tstamp ev_rt_now; + #define ev_rt_now ((loop)->ev_rt_now) + #define VAR(name,decl) decl; + #include "ev_vars.h" + #undef VAR + }; + #include "ev_wrap.h" + + static struct ev_loop default_loop_struct; + EV_API_DECL struct ev_loop *ev_default_loop_ptr = NULL; /* needs to be initialised to make it a definition despite extern */ + extern struct ev_loop *ev_default_loop_ptr; /* needs to be initialised to make it a definition despite extern */ + +#else + + EV_API_DECL ev_tstamp ev_rt_now = 0; /* needs to be initialised to make it a definition despite extern */ + #define VAR(name,decl) static decl; + #include "ev_vars.h" + #undef VAR + + static int ev_default_loop_ptr; + +#endif + +#if EV_FEATURE_API +# define EV_RELEASE_CB if (expect_false (release_cb)) release_cb (EV_A) +# define EV_ACQUIRE_CB if (expect_false (acquire_cb)) acquire_cb (EV_A) +# define EV_INVOKE_PENDING invoke_cb (EV_A) +#else +# define EV_RELEASE_CB (void)0 +# define EV_ACQUIRE_CB (void)0 +# define EV_INVOKE_PENDING ev_invoke_pending (EV_A) +#endif + +#define EVBREAK_RECURSE 0x80 + +/*****************************************************************************/ + +#ifndef EV_HAVE_EV_TIME +ev_tstamp +ev_time (void) EV_THROW +{ +#if EV_USE_REALTIME + if (expect_true (have_realtime)) + { + struct timespec ts; + clock_gettime (CLOCK_REALTIME, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; + } +#endif + 
+ struct timeval tv; + gettimeofday (&tv, 0); + return tv.tv_sec + tv.tv_usec * 1e-6; +} +#endif + +inline_size ev_tstamp +get_clock (void) +{ +#if EV_USE_MONOTONIC + if (expect_true (have_monotonic)) + { + struct timespec ts; + clock_gettime (CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; + } +#endif + + return ev_time (); +} + +#if EV_MULTIPLICITY +ev_tstamp +ev_now (EV_P) EV_THROW +{ + return ev_rt_now; +} +#endif + +void +ev_sleep (ev_tstamp delay) EV_THROW +{ + if (delay > 0.) + { +#if EV_USE_NANOSLEEP + struct timespec ts; + + EV_TS_SET (ts, delay); + nanosleep (&ts, 0); +#elif defined _WIN32 + Sleep ((unsigned long)(delay * 1e3)); +#else + struct timeval tv; + + /* here we rely on sys/time.h + sys/types.h + unistd.h providing select */ + /* something not guaranteed by newer posix versions, but guaranteed */ + /* by older ones */ + EV_TV_SET (tv, delay); + select (0, 0, 0, 0, &tv); +#endif + } +} + +/*****************************************************************************/ + +#define MALLOC_ROUND 4096 /* prefer to allocate in chunks of this size, must be 2**n and >> 4 longs */ + +/* find a suitable new size for the given array, */ +/* hopefully by rounding to a nice-to-malloc size */ +inline_size int +array_nextsize (int elem, int cur, int cnt) +{ + int ncur = cur + 1; + + do + ncur <<= 1; + while (cnt > ncur); + + /* if size is large, round to MALLOC_ROUND - 4 * longs to accommodate malloc overhead */ + if (elem * ncur > MALLOC_ROUND - sizeof (void *) * 4) + { + ncur *= elem; + ncur = (ncur + elem + (MALLOC_ROUND - 1) + sizeof (void *) * 4) & ~(MALLOC_ROUND - 1); + ncur = ncur - sizeof (void *) * 4; + ncur /= elem; + } + + return ncur; +} + +noinline ecb_cold +static void * +array_realloc (int elem, void *base, int *cur, int cnt) +{ + *cur = array_nextsize (elem, *cur, cnt); + return ev_realloc (base, elem * *cur); +} + +#define array_init_zero(base,count) \ + memset ((void *)(base), 0, sizeof (*(base)) * (count)) + +#define array_needsize(type,base,cur,cnt,init) \ + if (expect_false ((cnt) > (cur))) \ + { \ + ecb_unused int ocur_ = (cur); \ + (base) = (type *)array_realloc \ + (sizeof (type), (base), &(cur), (cnt)); \ + init ((base) + (ocur_), (cur) - ocur_); \ + } + +#if 0 +#define array_slim(type,stem) \ + if (stem ## max < array_roundsize (stem ## cnt >> 2)) \ + { \ + stem ## max = array_roundsize (stem ## cnt >> 1); \ + base = (type *)ev_realloc (base, sizeof (type) * (stem ## max));\ + fprintf (stderr, "slimmed down " # stem " to %d\n", stem ## max);/*D*/\ + } +#endif + +#define array_free(stem, idx) \ + ev_free (stem ## s idx); stem ## cnt idx = stem ## max idx = 0; stem ## s idx = 0 + +/*****************************************************************************/ + +/* dummy callback for pending events */ +noinline +static void +pendingcb (EV_P_ ev_prepare *w, int revents) +{ +} + +noinline +void +ev_feed_event (EV_P_ void *w, int revents) EV_THROW +{ + W w_ = (W)w; + int pri = ABSPRI (w_); + + if (expect_false (w_->pending)) + pendings [pri][w_->pending - 1].events |= revents; + else + { + w_->pending = ++pendingcnt [pri]; + array_needsize (ANPENDING, pendings [pri], pendingmax [pri], w_->pending, EMPTY2); + pendings [pri][w_->pending - 1].w = w_; + pendings [pri][w_->pending - 1].events = revents; + } + + pendingpri = NUMPRI - 1; +} + +inline_speed void +feed_reverse (EV_P_ W w) +{ + array_needsize (W, rfeeds, rfeedmax, rfeedcnt + 1, EMPTY2); + rfeeds [rfeedcnt++] = w; +} + +inline_size void +feed_reverse_done (EV_P_ int revents) +{ + do + 
ev_feed_event (EV_A_ rfeeds [--rfeedcnt], revents); + while (rfeedcnt); +} + +inline_speed void +queue_events (EV_P_ W *events, int eventcnt, int type) +{ + int i; + + for (i = 0; i < eventcnt; ++i) + ev_feed_event (EV_A_ events [i], type); +} + +/*****************************************************************************/ + +inline_speed void +fd_event_nocheck (EV_P_ int fd, int revents) +{ + ANFD *anfd = anfds + fd; + ev_io *w; + + for (w = (ev_io *)anfd->head; w; w = (ev_io *)((WL)w)->next) + { + int ev = w->events & revents; + + if (ev) + ev_feed_event (EV_A_ (W)w, ev); + } +} + +/* do not submit kernel events for fds that have reify set */ +/* because that means they changed while we were polling for new events */ +inline_speed void +fd_event (EV_P_ int fd, int revents) +{ + ANFD *anfd = anfds + fd; + + if (expect_true (!anfd->reify)) + fd_event_nocheck (EV_A_ fd, revents); +} + +void +ev_feed_fd_event (EV_P_ int fd, int revents) EV_THROW +{ + if (fd >= 0 && fd < anfdmax) + fd_event_nocheck (EV_A_ fd, revents); +} + +/* make sure the external fd watch events are in-sync */ +/* with the kernel/libev internal state */ +inline_size void +fd_reify (EV_P) +{ + int i; + +#if EV_SELECT_IS_WINSOCKET || EV_USE_IOCP + for (i = 0; i < fdchangecnt; ++i) + { + int fd = fdchanges [i]; + ANFD *anfd = anfds + fd; + + if (anfd->reify & EV__IOFDSET && anfd->head) + { + SOCKET handle = EV_FD_TO_WIN32_HANDLE (fd); + + if (handle != anfd->handle) + { + unsigned long arg; + + assert (("libev: only socket fds supported in this configuration", ioctlsocket (handle, FIONREAD, &arg) == 0)); + + /* handle changed, but fd didn't - we need to do it in two steps */ + backend_modify (EV_A_ fd, anfd->events, 0); + anfd->events = 0; + anfd->handle = handle; + } + } + } +#endif + + for (i = 0; i < fdchangecnt; ++i) + { + int fd = fdchanges [i]; + ANFD *anfd = anfds + fd; + ev_io *w; + + unsigned char o_events = anfd->events; + unsigned char o_reify = anfd->reify; + + anfd->reify = 0; + + /*if (expect_true (o_reify & EV_ANFD_REIFY)) probably a deoptimisation */ + { + anfd->events = 0; + + for (w = (ev_io *)anfd->head; w; w = (ev_io *)((WL)w)->next) + anfd->events |= (unsigned char)w->events; + + if (o_events != anfd->events) + o_reify = EV__IOFDSET; /* actually |= */ + } + + if (o_reify & EV__IOFDSET) { + backend_modify (EV_A_ fd, o_events, anfd->events); + } + } + + fdchangecnt = 0; +} + +/* something about the given fd changed */ +inline_size +void +fd_change (EV_P_ int fd, int flags) +{ + unsigned char reify = anfds [fd].reify; + anfds [fd].reify |= flags; + + if (expect_true (!reify)) + { + ++fdchangecnt; + array_needsize (int, fdchanges, fdchangemax, fdchangecnt, EMPTY2); + fdchanges [fdchangecnt - 1] = fd; + } +} + +/* the given fd is invalid/unusable, so make sure it doesn't hurt us anymore */ +inline_speed ecb_cold void +fd_kill (EV_P_ int fd) +{ + ev_io *w; + + while ((w = (ev_io *)anfds [fd].head)) + { + ev_io_stop (EV_A_ w); + ev_feed_event (EV_A_ (W)w, EV_ERROR | EV_READ | EV_WRITE); + } +} + +/* check whether the given fd is actually valid, for error recovery */ +inline_size ecb_cold int +fd_valid (int fd) +{ +#ifdef _WIN32 + return EV_FD_TO_WIN32_HANDLE (fd) != -1; +#else + return fcntl (fd, F_GETFD) != -1; +#endif +} + +/* called on EBADF to verify fds */ +noinline ecb_cold +static void +fd_ebadf (EV_P) +{ + int fd; + + for (fd = 0; fd < anfdmax; ++fd) + if (anfds [fd].events) + if (!fd_valid (fd) && errno == EBADF) + fd_kill (EV_A_ fd); +} + +/* called on ENOMEM in select/poll to kill some fds and 
retry */ +noinline ecb_cold +static void +fd_enomem (EV_P) +{ + int fd; + + for (fd = anfdmax; fd--; ) + if (anfds [fd].events) + { + fd_kill (EV_A_ fd); + break; + } +} + +/* usually called after fork if backend needs to re-arm all fds from scratch */ +noinline +static void +fd_rearm_all (EV_P) +{ + int fd; + + for (fd = 0; fd < anfdmax; ++fd) + if (anfds [fd].events) + { + anfds [fd].events = 0; + anfds [fd].emask = 0; + fd_change (EV_A_ fd, EV__IOFDSET | EV_ANFD_REIFY); + } +} + +/* used to prepare libev internal fd's */ +/* this is not fork-safe */ +inline_speed void +fd_intern (int fd) +{ +#ifdef _WIN32 + unsigned long arg = 1; + ioctlsocket (EV_FD_TO_WIN32_HANDLE (fd), FIONBIO, &arg); +#else + fcntl (fd, F_SETFD, FD_CLOEXEC); + fcntl (fd, F_SETFL, O_NONBLOCK); +#endif +} + +/*****************************************************************************/ + +/* + * the heap functions want a real array index. array index 0 is guaranteed to not + * be in-use at any time. the first heap entry is at array [HEAP0]. DHEAP gives + * the branching factor of the d-tree. + */ + +/* + * at the moment we allow libev the luxury of two heaps, + * a small-code-size 2-heap one and a ~1.5kb larger 4-heap + * which is more cache-efficient. + * the difference is about 5% with 50000+ watchers. + */ +#if EV_USE_4HEAP + +#define DHEAP 4 +#define HEAP0 (DHEAP - 1) /* index of first element in heap */ +#define HPARENT(k) ((((k) - HEAP0 - 1) / DHEAP) + HEAP0) +#define UPHEAP_DONE(p,k) ((p) == (k)) + +/* away from the root */ +inline_speed void +downheap (ANHE *heap, int N, int k) +{ + ANHE he = heap [k]; + ANHE *E = heap + N + HEAP0; + + for (;;) + { + ev_tstamp minat; + ANHE *minpos; + ANHE *pos = heap + DHEAP * (k - HEAP0) + HEAP0 + 1; + + /* find minimum child */ + if (expect_true (pos + DHEAP - 1 < E)) + { + /* fast path */ (minpos = pos + 0), (minat = ANHE_at (*minpos)); + if ( ANHE_at (pos [1]) < minat) (minpos = pos + 1), (minat = ANHE_at (*minpos)); + if ( ANHE_at (pos [2]) < minat) (minpos = pos + 2), (minat = ANHE_at (*minpos)); + if ( ANHE_at (pos [3]) < minat) (minpos = pos + 3), (minat = ANHE_at (*minpos)); + } + else if (pos < E) + { + /* slow path */ (minpos = pos + 0), (minat = ANHE_at (*minpos)); + if (pos + 1 < E && ANHE_at (pos [1]) < minat) (minpos = pos + 1), (minat = ANHE_at (*minpos)); + if (pos + 2 < E && ANHE_at (pos [2]) < minat) (minpos = pos + 2), (minat = ANHE_at (*minpos)); + if (pos + 3 < E && ANHE_at (pos [3]) < minat) (minpos = pos + 3), (minat = ANHE_at (*minpos)); + } + else + break; + + if (ANHE_at (he) <= minat) + break; + + heap [k] = *minpos; + ev_active (ANHE_w (*minpos)) = k; + + k = minpos - heap; + } + + heap [k] = he; + ev_active (ANHE_w (he)) = k; +} + +#else /* 4HEAP */ + +#define HEAP0 1 +#define HPARENT(k) ((k) >> 1) +#define UPHEAP_DONE(p,k) (!(p)) + +/* away from the root */ +inline_speed void +downheap (ANHE *heap, int N, int k) +{ + ANHE he = heap [k]; + + for (;;) + { + int c = k << 1; + + if (c >= N + HEAP0) + break; + + c += c + 1 < N + HEAP0 && ANHE_at (heap [c]) > ANHE_at (heap [c + 1]) + ? 
1 : 0; + + if (ANHE_at (he) <= ANHE_at (heap [c])) + break; + + heap [k] = heap [c]; + ev_active (ANHE_w (heap [k])) = k; + + k = c; + } + + heap [k] = he; + ev_active (ANHE_w (he)) = k; +} +#endif + +/* towards the root */ +inline_speed void +upheap (ANHE *heap, int k) +{ + ANHE he = heap [k]; + + for (;;) + { + int p = HPARENT (k); + + if (UPHEAP_DONE (p, k) || ANHE_at (heap [p]) <= ANHE_at (he)) + break; + + heap [k] = heap [p]; + ev_active (ANHE_w (heap [k])) = k; + k = p; + } + + heap [k] = he; + ev_active (ANHE_w (he)) = k; +} + +/* move an element suitably so it is in a correct place */ +inline_size void +adjustheap (ANHE *heap, int N, int k) +{ + if (k > HEAP0 && ANHE_at (heap [k]) <= ANHE_at (heap [HPARENT (k)])) + upheap (heap, k); + else + downheap (heap, N, k); +} + +/* rebuild the heap: this function is used only once and executed rarely */ +inline_size void +reheap (ANHE *heap, int N) +{ + int i; + + /* we don't use floyds algorithm, upheap is simpler and is more cache-efficient */ + /* also, this is easy to implement and correct for both 2-heaps and 4-heaps */ + for (i = 0; i < N; ++i) + upheap (heap, i + HEAP0); +} + +/*****************************************************************************/ + +/* associate signal watchers to a signal signal */ +typedef struct +{ + EV_ATOMIC_T pending; +#if EV_MULTIPLICITY + EV_P; +#endif + WL head; +} ANSIG; + +static ANSIG signals [EV_NSIG - 1]; + +/*****************************************************************************/ + +#if EV_SIGNAL_ENABLE || EV_ASYNC_ENABLE + +noinline ecb_cold +static void +evpipe_init (EV_P) +{ + if (!ev_is_active (&pipe_w)) + { + int fds [2]; + +# if EV_USE_EVENTFD + fds [0] = -1; + fds [1] = eventfd (0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fds [1] < 0 && errno == EINVAL) + fds [1] = eventfd (0, 0); + + if (fds [1] < 0) +# endif + { + while (pipe (fds)) + ev_syserr ("(libev) error creating signal/async pipe"); + + fd_intern (fds [0]); + } + + evpipe [0] = fds [0]; + + if (evpipe [1] < 0) + evpipe [1] = fds [1]; /* first call, set write fd */ + else + { + /* on subsequent calls, do not change evpipe [1] */ + /* so that evpipe_write can always rely on its value. */ + /* this branch does not do anything sensible on windows, */ + /* so must not be executed on windows */ + + dup2 (fds [1], evpipe [1]); + close (fds [1]); + } + + fd_intern (evpipe [1]); + + ev_io_set (&pipe_w, evpipe [0] < 0 ? 
evpipe [1] : evpipe [0], EV_READ); + ev_io_start (EV_A_ &pipe_w); + ev_unref (EV_A); /* watcher should not keep loop alive */ + } +} + +inline_speed void +evpipe_write (EV_P_ EV_ATOMIC_T *flag) +{ + ECB_MEMORY_FENCE; /* push out the write before this function was called, acquire flag */ + + if (expect_true (*flag)) + return; + + *flag = 1; + ECB_MEMORY_FENCE_RELEASE; /* make sure flag is visible before the wakeup */ + + pipe_write_skipped = 1; + + ECB_MEMORY_FENCE; /* make sure pipe_write_skipped is visible before we check pipe_write_wanted */ + + if (pipe_write_wanted) + { + int old_errno; + + pipe_write_skipped = 0; + ECB_MEMORY_FENCE_RELEASE; + + old_errno = errno; /* save errno because write will clobber it */ + +#if EV_USE_EVENTFD + if (evpipe [0] < 0) + { + uint64_t counter = 1; + write (evpipe [1], &counter, sizeof (uint64_t)); + } + else +#endif + { +#ifdef _WIN32 + WSABUF buf; + DWORD sent; + buf.buf = &buf; + buf.len = 1; + WSASend (EV_FD_TO_WIN32_HANDLE (evpipe [1]), &buf, 1, &sent, 0, 0, 0); +#else + write (evpipe [1], &(evpipe [1]), 1); +#endif + } + + errno = old_errno; + } +} + +/* called whenever the libev signal pipe */ +/* got some events (signal, async) */ +static void +pipecb (EV_P_ ev_io *iow, int revents) +{ + int i; + + if (revents & EV_READ) + { +#if EV_USE_EVENTFD + if (evpipe [0] < 0) + { + uint64_t counter; + read (evpipe [1], &counter, sizeof (uint64_t)); + } + else +#endif + { + char dummy[4]; +#ifdef _WIN32 + WSABUF buf; + DWORD recvd; + DWORD flags = 0; + buf.buf = dummy; + buf.len = sizeof (dummy); + WSARecv (EV_FD_TO_WIN32_HANDLE (evpipe [0]), &buf, 1, &recvd, &flags, 0, 0); +#else + read (evpipe [0], &dummy, sizeof (dummy)); +#endif + } + } + + pipe_write_skipped = 0; + + ECB_MEMORY_FENCE; /* push out skipped, acquire flags */ + +#if EV_SIGNAL_ENABLE + if (sig_pending) + { + sig_pending = 0; + + ECB_MEMORY_FENCE; + + for (i = EV_NSIG - 1; i--; ) + if (expect_false (signals [i].pending)) + ev_feed_signal_event (EV_A_ i + 1); + } +#endif + +#if EV_ASYNC_ENABLE + if (async_pending) + { + async_pending = 0; + + ECB_MEMORY_FENCE; + + for (i = asynccnt; i--; ) + if (asyncs [i]->sent) + { + asyncs [i]->sent = 0; + ECB_MEMORY_FENCE_RELEASE; + ev_feed_event (EV_A_ asyncs [i], EV_ASYNC); + } + } +#endif +} + +/*****************************************************************************/ + +void +ev_feed_signal (int signum) EV_THROW +{ +#if EV_MULTIPLICITY + EV_P; + ECB_MEMORY_FENCE_ACQUIRE; + EV_A = signals [signum - 1].loop; + + if (!EV_A) + return; +#endif + + signals [signum - 1].pending = 1; + evpipe_write (EV_A_ &sig_pending); +} + +static void +ev_sighandler (int signum) +{ +#ifdef _WIN32 + signal (signum, ev_sighandler); +#endif + + ev_feed_signal (signum); +} + +noinline +void +ev_feed_signal_event (EV_P_ int signum) EV_THROW +{ + WL w; + + if (expect_false (signum <= 0 || signum >= EV_NSIG)) + return; + + --signum; + +#if EV_MULTIPLICITY + /* it is permissible to try to feed a signal to the wrong loop */ + /* or, likely more useful, feeding a signal nobody is waiting for */ + + if (expect_false (signals [signum].loop != EV_A)) + return; +#endif + + signals [signum].pending = 0; + ECB_MEMORY_FENCE_RELEASE; + + for (w = signals [signum].head; w; w = w->next) + ev_feed_event (EV_A_ (W)w, EV_SIGNAL); +} + +#if EV_USE_SIGNALFD +static void +sigfdcb (EV_P_ ev_io *iow, int revents) +{ + struct signalfd_siginfo si[2], *sip; /* these structs are big */ + + for (;;) + { + ssize_t res = read (sigfd, si, sizeof (si)); + + /* not ISO-C, as res might be -1, but works 
with SuS */ + for (sip = si; (char *)sip < (char *)si + res; ++sip) + ev_feed_signal_event (EV_A_ sip->ssi_signo); + + if (res < (ssize_t)sizeof (si)) + break; + } +} +#endif + +#endif + +/*****************************************************************************/ + +#if EV_CHILD_ENABLE +static WL childs [EV_PID_HASHSIZE]; + +static ev_signal childev; + +#ifndef WIFCONTINUED +# define WIFCONTINUED(status) 0 +#endif + +/* handle a single child status event */ +inline_speed void +child_reap (EV_P_ int chain, int pid, int status) +{ + ev_child *w; + int traced = WIFSTOPPED (status) || WIFCONTINUED (status); + + for (w = (ev_child *)childs [chain & ((EV_PID_HASHSIZE) - 1)]; w; w = (ev_child *)((WL)w)->next) + { + if ((w->pid == pid || !w->pid) + && (!traced || (w->flags & 1))) + { + ev_set_priority (w, EV_MAXPRI); /* need to do it *now*, this *must* be the same prio as the signal watcher itself */ + w->rpid = pid; + w->rstatus = status; + ev_feed_event (EV_A_ (W)w, EV_CHILD); + } + } +} + +#ifndef WCONTINUED +# define WCONTINUED 0 +#endif + +/* called on sigchld etc., calls waitpid */ +static void +childcb (EV_P_ ev_signal *sw, int revents) +{ + int pid, status; + + /* some systems define WCONTINUED but then fail to support it (linux 2.4) */ + if (0 >= (pid = waitpid (-1, &status, WNOHANG | WUNTRACED | WCONTINUED))) + if (!WCONTINUED + || errno != EINVAL + || 0 >= (pid = waitpid (-1, &status, WNOHANG | WUNTRACED))) + return; + + /* make sure we are called again until all children have been reaped */ + /* we need to do it this way so that the callback gets called before we continue */ + ev_feed_event (EV_A_ (W)sw, EV_SIGNAL); + + child_reap (EV_A_ pid, pid, status); + if ((EV_PID_HASHSIZE) > 1) + child_reap (EV_A_ 0, pid, status); /* this might trigger a watcher twice, but feed_event catches that */ +} + +#endif + +/*****************************************************************************/ + +#if EV_USE_IOCP +# include "ev_iocp.c" +#endif +#if EV_USE_PORT +# include "ev_port.c" +#endif +#if EV_USE_KQUEUE +# include "ev_kqueue.c" +#endif +#if EV_USE_EPOLL +# include "ev_epoll.c" +#endif +#if EV_USE_POLL +# include "ev_poll.c" +#endif +#if EV_USE_SELECT +# include "ev_select.c" +#endif + +ecb_cold int +ev_version_major (void) EV_THROW +{ + return EV_VERSION_MAJOR; +} + +ecb_cold int +ev_version_minor (void) EV_THROW +{ + return EV_VERSION_MINOR; +} + +/* return true if we are running with elevated privileges and should ignore env variables */ +inline_size ecb_cold int +enable_secure (void) +{ +#ifdef _WIN32 + return 0; +#else + return getuid () != geteuid () + || getgid () != getegid (); +#endif +} + +ecb_cold +unsigned int +ev_supported_backends (void) EV_THROW +{ + unsigned int flags = 0; + + if (EV_USE_PORT ) flags |= EVBACKEND_PORT; + if (EV_USE_KQUEUE) flags |= EVBACKEND_KQUEUE; + if (EV_USE_EPOLL ) flags |= EVBACKEND_EPOLL; + if (EV_USE_POLL ) flags |= EVBACKEND_POLL; + if (EV_USE_SELECT) flags |= EVBACKEND_SELECT; + + return flags; +} + +ecb_cold +unsigned int +ev_recommended_backends (void) EV_THROW +{ + unsigned int flags = ev_supported_backends (); + +#ifndef __NetBSD__ + /* kqueue is borked on everything but netbsd apparently */ + /* it usually doesn't work correctly on anything but sockets and pipes */ + flags &= ~EVBACKEND_KQUEUE; +#endif +#ifdef __APPLE__ + /* only select works correctly on that "unix-certified" platform */ + flags &= ~EVBACKEND_KQUEUE; /* horribly broken, even for sockets */ + flags &= ~EVBACKEND_POLL; /* poll is based on kqueue from 10.5 onwards */ +#endif 
+#ifdef __FreeBSD__ + flags &= ~EVBACKEND_POLL; /* poll return value is unusable (http://forums.freebsd.org/archive/index.php/t-10270.html) */ +#endif + + return flags; +} + +ecb_cold +unsigned int +ev_embeddable_backends (void) EV_THROW +{ + int flags = EVBACKEND_EPOLL | EVBACKEND_KQUEUE | EVBACKEND_PORT; + + /* epoll embeddability broken on all linux versions up to at least 2.6.23 */ + if (ev_linux_version () < 0x020620) /* disable it on linux < 2.6.32 */ + flags &= ~EVBACKEND_EPOLL; + + return flags; +} + +unsigned int +ev_backend (EV_P) EV_THROW +{ + return backend; +} + +#if EV_FEATURE_API +unsigned int +ev_iteration (EV_P) EV_THROW +{ + return loop_count; +} + +unsigned int +ev_depth (EV_P) EV_THROW +{ + return loop_depth; +} + +void +ev_set_io_collect_interval (EV_P_ ev_tstamp interval) EV_THROW +{ + io_blocktime = interval; +} + +void +ev_set_timeout_collect_interval (EV_P_ ev_tstamp interval) EV_THROW +{ + timeout_blocktime = interval; +} + +void +ev_set_userdata (EV_P_ void *data) EV_THROW +{ + userdata = data; +} + +void * +ev_userdata (EV_P) EV_THROW +{ + return userdata; +} + +void +ev_set_invoke_pending_cb (EV_P_ ev_loop_callback invoke_pending_cb) EV_THROW +{ + invoke_cb = invoke_pending_cb; +} + +void +ev_set_loop_release_cb (EV_P_ void (*release)(EV_P) EV_THROW, void (*acquire)(EV_P) EV_THROW) EV_THROW +{ + release_cb = release; + acquire_cb = acquire; +} +#endif + +/* initialise a loop structure, must be zero-initialised */ +noinline ecb_cold +static void +loop_init (EV_P_ unsigned int flags) EV_THROW +{ + if (!backend) + { + origflags = flags; + +#if EV_USE_REALTIME + if (!have_realtime) + { + struct timespec ts; + + if (!clock_gettime (CLOCK_REALTIME, &ts)) + have_realtime = 1; + } +#endif + +#if EV_USE_MONOTONIC + if (!have_monotonic) + { + struct timespec ts; + + if (!clock_gettime (CLOCK_MONOTONIC, &ts)) + have_monotonic = 1; + } +#endif + + /* pid check not overridable via env */ +#ifndef _WIN32 + if (flags & EVFLAG_FORKCHECK) + curpid = getpid (); +#endif + + if (!(flags & EVFLAG_NOENV) + && !enable_secure () + && getenv ("LIBEV_FLAGS")) + flags = atoi (getenv ("LIBEV_FLAGS")); + + ev_rt_now = ev_time (); + mn_now = get_clock (); + now_floor = mn_now; + rtmn_diff = ev_rt_now - mn_now; +#if EV_FEATURE_API + invoke_cb = ev_invoke_pending; +#endif + + io_blocktime = 0.; + timeout_blocktime = 0.; + backend = 0; + backend_fd = -1; + sig_pending = 0; +#if EV_ASYNC_ENABLE + async_pending = 0; +#endif + pipe_write_skipped = 0; + pipe_write_wanted = 0; + evpipe [0] = -1; + evpipe [1] = -1; +#if EV_USE_INOTIFY + fs_fd = flags & EVFLAG_NOINOTIFY ? -1 : -2; +#endif +#if EV_USE_SIGNALFD + sigfd = flags & EVFLAG_SIGNALFD ? 
-2 : -1; +#endif + + if (!(flags & EVBACKEND_MASK)) + flags |= ev_recommended_backends (); + +#if EV_USE_IOCP + if (!backend && (flags & EVBACKEND_IOCP )) backend = iocp_init (EV_A_ flags); +#endif +#if EV_USE_PORT + if (!backend && (flags & EVBACKEND_PORT )) backend = port_init (EV_A_ flags); +#endif +#if EV_USE_KQUEUE + if (!backend && (flags & EVBACKEND_KQUEUE)) backend = kqueue_init (EV_A_ flags); +#endif +#if EV_USE_EPOLL + if (!backend && (flags & EVBACKEND_EPOLL )) backend = epoll_init (EV_A_ flags); +#endif +#if EV_USE_POLL + if (!backend && (flags & EVBACKEND_POLL )) backend = poll_init (EV_A_ flags); +#endif +#if EV_USE_SELECT + if (!backend && (flags & EVBACKEND_SELECT)) backend = select_init (EV_A_ flags); +#endif + + ev_prepare_init (&pending_w, pendingcb); + +#if EV_SIGNAL_ENABLE || EV_ASYNC_ENABLE + ev_init (&pipe_w, pipecb); + ev_set_priority (&pipe_w, EV_MAXPRI); +#endif + } +} + +/* free up a loop structure */ +ecb_cold +void +ev_loop_destroy (EV_P) +{ + int i; + +#if EV_MULTIPLICITY + /* mimic free (0) */ + if (!EV_A) + return; +#endif + +#if EV_CLEANUP_ENABLE + /* queue cleanup watchers (and execute them) */ + if (expect_false (cleanupcnt)) + { + queue_events (EV_A_ (W *)cleanups, cleanupcnt, EV_CLEANUP); + EV_INVOKE_PENDING; + } +#endif + +#if EV_CHILD_ENABLE + if (ev_is_default_loop (EV_A) && ev_is_active (&childev)) + { + ev_ref (EV_A); /* child watcher */ + ev_signal_stop (EV_A_ &childev); + } +#endif + + if (ev_is_active (&pipe_w)) + { + /*ev_ref (EV_A);*/ + /*ev_io_stop (EV_A_ &pipe_w);*/ + + if (evpipe [0] >= 0) EV_WIN32_CLOSE_FD (evpipe [0]); + if (evpipe [1] >= 0) EV_WIN32_CLOSE_FD (evpipe [1]); + } + +#if EV_USE_SIGNALFD + if (ev_is_active (&sigfd_w)) + close (sigfd); +#endif + +#if EV_USE_INOTIFY + if (fs_fd >= 0) + close (fs_fd); +#endif + + if (backend_fd >= 0) + close (backend_fd); + +#if EV_USE_IOCP + if (backend == EVBACKEND_IOCP ) iocp_destroy (EV_A); +#endif +#if EV_USE_PORT + if (backend == EVBACKEND_PORT ) port_destroy (EV_A); +#endif +#if EV_USE_KQUEUE + if (backend == EVBACKEND_KQUEUE) kqueue_destroy (EV_A); +#endif +#if EV_USE_EPOLL + if (backend == EVBACKEND_EPOLL ) epoll_destroy (EV_A); +#endif +#if EV_USE_POLL + if (backend == EVBACKEND_POLL ) poll_destroy (EV_A); +#endif +#if EV_USE_SELECT + if (backend == EVBACKEND_SELECT) select_destroy (EV_A); +#endif + + for (i = NUMPRI; i--; ) + { + array_free (pending, [i]); +#if EV_IDLE_ENABLE + array_free (idle, [i]); +#endif + } + + ev_free (anfds); anfds = 0; anfdmax = 0; + + /* have to use the microsoft-never-gets-it-right macro */ + array_free (rfeed, EMPTY); + array_free (fdchange, EMPTY); + array_free (timer, EMPTY); +#if EV_PERIODIC_ENABLE + array_free (periodic, EMPTY); +#endif +#if EV_FORK_ENABLE + array_free (fork, EMPTY); +#endif +#if EV_CLEANUP_ENABLE + array_free (cleanup, EMPTY); +#endif + array_free (prepare, EMPTY); + array_free (check, EMPTY); +#if EV_ASYNC_ENABLE + array_free (async, EMPTY); +#endif + + backend = 0; + +#if EV_MULTIPLICITY + if (ev_is_default_loop (EV_A)) +#endif + ev_default_loop_ptr = 0; +#if EV_MULTIPLICITY + else + ev_free (EV_A); +#endif +} + +#if EV_USE_INOTIFY +inline_size void infy_fork (EV_P); +#endif + +inline_size void +loop_fork (EV_P) +{ +#if EV_USE_PORT + if (backend == EVBACKEND_PORT ) port_fork (EV_A); +#endif +#if EV_USE_KQUEUE + if (backend == EVBACKEND_KQUEUE) kqueue_fork (EV_A); +#endif +#if EV_USE_EPOLL + if (backend == EVBACKEND_EPOLL ) epoll_fork (EV_A); +#endif +#if EV_USE_INOTIFY + infy_fork (EV_A); +#endif + +#if EV_SIGNAL_ENABLE || 
EV_ASYNC_ENABLE + if (ev_is_active (&pipe_w) && postfork != 2) + { + /* pipe_write_wanted must be false now, so modifying fd vars should be safe */ + + ev_ref (EV_A); + ev_io_stop (EV_A_ &pipe_w); + + if (evpipe [0] >= 0) + EV_WIN32_CLOSE_FD (evpipe [0]); + + evpipe_init (EV_A); + /* iterate over everything, in case we missed something before */ + ev_feed_event (EV_A_ &pipe_w, EV_CUSTOM); + } +#endif + + postfork = 0; +} + +#if EV_MULTIPLICITY + +ecb_cold +struct ev_loop * +ev_loop_new (unsigned int flags) EV_THROW +{ + EV_P = (struct ev_loop *)ev_malloc (sizeof (struct ev_loop)); + + memset (EV_A, 0, sizeof (struct ev_loop)); + loop_init (EV_A_ flags); + + if (ev_backend (EV_A)) + return EV_A; + + ev_free (EV_A); + return 0; +} + +#endif /* multiplicity */ + +#if EV_VERIFY +noinline ecb_cold +static void +verify_watcher (EV_P_ W w) +{ + assert (("libev: watcher has invalid priority", ABSPRI (w) >= 0 && ABSPRI (w) < NUMPRI)); + + if (w->pending) + assert (("libev: pending watcher not on pending queue", pendings [ABSPRI (w)][w->pending - 1].w == w)); +} + +noinline ecb_cold +static void +verify_heap (EV_P_ ANHE *heap, int N) +{ + int i; + + for (i = HEAP0; i < N + HEAP0; ++i) + { + assert (("libev: active index mismatch in heap", ev_active (ANHE_w (heap [i])) == i)); + assert (("libev: heap condition violated", i == HEAP0 || ANHE_at (heap [HPARENT (i)]) <= ANHE_at (heap [i]))); + assert (("libev: heap at cache mismatch", ANHE_at (heap [i]) == ev_at (ANHE_w (heap [i])))); + + verify_watcher (EV_A_ (W)ANHE_w (heap [i])); + } +} + +noinline ecb_cold +static void +array_verify (EV_P_ W *ws, int cnt) +{ + while (cnt--) + { + assert (("libev: active index mismatch", ev_active (ws [cnt]) == cnt + 1)); + verify_watcher (EV_A_ ws [cnt]); + } +} +#endif + +#if EV_FEATURE_API +void ecb_cold +ev_verify (EV_P) EV_THROW +{ +#if EV_VERIFY + int i; + WL w, w2; + + assert (activecnt >= -1); + + assert (fdchangemax >= fdchangecnt); + for (i = 0; i < fdchangecnt; ++i) + assert (("libev: negative fd in fdchanges", fdchanges [i] >= 0)); + + assert (anfdmax >= 0); + for (i = 0; i < anfdmax; ++i) + { + int j = 0; + + for (w = w2 = anfds [i].head; w; w = w->next) + { + verify_watcher (EV_A_ (W)w); + + if (j++ & 1) + { + assert (("libev: io watcher list contains a loop", w != w2)); + w2 = w2->next; + } + + assert (("libev: inactive fd watcher on anfd list", ev_active (w) == 1)); + assert (("libev: fd mismatch between watcher and anfd", ((ev_io *)w)->fd == i)); + } + } + + assert (timermax >= timercnt); + verify_heap (EV_A_ timers, timercnt); + +#if EV_PERIODIC_ENABLE + assert (periodicmax >= periodiccnt); + verify_heap (EV_A_ periodics, periodiccnt); +#endif + + for (i = NUMPRI; i--; ) + { + assert (pendingmax [i] >= pendingcnt [i]); +#if EV_IDLE_ENABLE + assert (idleall >= 0); + assert (idlemax [i] >= idlecnt [i]); + array_verify (EV_A_ (W *)idles [i], idlecnt [i]); +#endif + } + +#if EV_FORK_ENABLE + assert (forkmax >= forkcnt); + array_verify (EV_A_ (W *)forks, forkcnt); +#endif + +#if EV_CLEANUP_ENABLE + assert (cleanupmax >= cleanupcnt); + array_verify (EV_A_ (W *)cleanups, cleanupcnt); +#endif + +#if EV_ASYNC_ENABLE + assert (asyncmax >= asynccnt); + array_verify (EV_A_ (W *)asyncs, asynccnt); +#endif + +#if EV_PREPARE_ENABLE + assert (preparemax >= preparecnt); + array_verify (EV_A_ (W *)prepares, preparecnt); +#endif + +#if EV_CHECK_ENABLE + assert (checkmax >= checkcnt); + array_verify (EV_A_ (W *)checks, checkcnt); +#endif + +# if 0 +#if EV_CHILD_ENABLE + for (w = (ev_child *)childs [chain & 
((EV_PID_HASHSIZE) - 1)]; w; w = (ev_child *)((WL)w)->next) + for (signum = EV_NSIG; signum--; ) if (signals [signum].pending) +#endif +# endif +#endif +} +#endif + +#if EV_MULTIPLICITY +ecb_cold +struct ev_loop * +#else +int +#endif +ev_default_loop (unsigned int flags) EV_THROW +{ + if (!ev_default_loop_ptr) + { +#if EV_MULTIPLICITY + EV_P = ev_default_loop_ptr = &default_loop_struct; +#else + ev_default_loop_ptr = 1; +#endif + + loop_init (EV_A_ flags); + + if (ev_backend (EV_A)) + { +#if EV_CHILD_ENABLE + ev_signal_init (&childev, childcb, SIGCHLD); + ev_set_priority (&childev, EV_MAXPRI); + ev_signal_start (EV_A_ &childev); + ev_unref (EV_A); /* child watcher should not keep loop alive */ +#endif + } + else + ev_default_loop_ptr = 0; + } + + return ev_default_loop_ptr; +} + +void +ev_loop_fork (EV_P) EV_THROW +{ + postfork = 1; +} + +/*****************************************************************************/ + +void +ev_invoke (EV_P_ void *w, int revents) +{ + EV_CB_INVOKE ((W)w, revents); +} + +unsigned int +ev_pending_count (EV_P) EV_THROW +{ + int pri; + unsigned int count = 0; + + for (pri = NUMPRI; pri--; ) + count += pendingcnt [pri]; + + return count; +} + +noinline +void +ev_invoke_pending (EV_P) +{ + pendingpri = NUMPRI; + + while (pendingpri) /* pendingpri possibly gets modified in the inner loop */ + { + --pendingpri; + + while (pendingcnt [pendingpri]) + { + ANPENDING *p = pendings [pendingpri] + --pendingcnt [pendingpri]; + + p->w->pending = 0; + EV_CB_INVOKE (p->w, p->events); + EV_FREQUENT_CHECK; + } + } +} + +#if EV_IDLE_ENABLE +/* make idle watchers pending. this handles the "call-idle */ +/* only when higher priorities are idle" logic */ +inline_size void +idle_reify (EV_P) +{ + if (expect_false (idleall)) + { + int pri; + + for (pri = NUMPRI; pri--; ) + { + if (pendingcnt [pri]) + break; + + if (idlecnt [pri]) + { + queue_events (EV_A_ (W *)idles [pri], idlecnt [pri], EV_IDLE); + break; + } + } + } +} +#endif + +/* make timers pending */ +inline_size void +timers_reify (EV_P) +{ + EV_FREQUENT_CHECK; + + if (timercnt && ANHE_at (timers [HEAP0]) < mn_now) + { + do + { + ev_timer *w = (ev_timer *)ANHE_w (timers [HEAP0]); + + /*assert (("libev: inactive timer on timer heap detected", ev_is_active (w)));*/ + + /* first reschedule or stop timer */ + if (w->repeat) + { + ev_at (w) += w->repeat; + if (ev_at (w) < mn_now) + ev_at (w) = mn_now; + + assert (("libev: negative ev_timer repeat value found while processing timers", w->repeat > 0.)); + + ANHE_at_cache (timers [HEAP0]); + downheap (timers, timercnt, HEAP0); + } + else + ev_timer_stop (EV_A_ w); /* nonrepeating: stop timer */ + + EV_FREQUENT_CHECK; + feed_reverse (EV_A_ (W)w); + } + while (timercnt && ANHE_at (timers [HEAP0]) < mn_now); + + feed_reverse_done (EV_A_ EV_TIMER); + } +} + +#if EV_PERIODIC_ENABLE + +noinline +static void +periodic_recalc (EV_P_ ev_periodic *w) +{ + ev_tstamp interval = w->interval > MIN_INTERVAL ? 
w->interval : MIN_INTERVAL; + ev_tstamp at = w->offset + interval * ev_floor ((ev_rt_now - w->offset) / interval); + + /* the above almost always errs on the low side */ + while (at <= ev_rt_now) + { + ev_tstamp nat = at + w->interval; + + /* when resolution fails us, we use ev_rt_now */ + if (expect_false (nat == at)) + { + at = ev_rt_now; + break; + } + + at = nat; + } + + ev_at (w) = at; +} + +/* make periodics pending */ +inline_size void +periodics_reify (EV_P) +{ + EV_FREQUENT_CHECK; + + while (periodiccnt && ANHE_at (periodics [HEAP0]) < ev_rt_now) + { + do + { + ev_periodic *w = (ev_periodic *)ANHE_w (periodics [HEAP0]); + + /*assert (("libev: inactive timer on periodic heap detected", ev_is_active (w)));*/ + + /* first reschedule or stop timer */ + if (w->reschedule_cb) + { + ev_at (w) = w->reschedule_cb (w, ev_rt_now); + + assert (("libev: ev_periodic reschedule callback returned time in the past", ev_at (w) >= ev_rt_now)); + + ANHE_at_cache (periodics [HEAP0]); + downheap (periodics, periodiccnt, HEAP0); + } + else if (w->interval) + { + periodic_recalc (EV_A_ w); + ANHE_at_cache (periodics [HEAP0]); + downheap (periodics, periodiccnt, HEAP0); + } + else + ev_periodic_stop (EV_A_ w); /* nonrepeating: stop timer */ + + EV_FREQUENT_CHECK; + feed_reverse (EV_A_ (W)w); + } + while (periodiccnt && ANHE_at (periodics [HEAP0]) < ev_rt_now); + + feed_reverse_done (EV_A_ EV_PERIODIC); + } +} + +/* simply recalculate all periodics */ +/* TODO: maybe ensure that at least one event happens when jumping forward? */ +noinline ecb_cold +static void +periodics_reschedule (EV_P) +{ + int i; + + /* adjust periodics after time jump */ + for (i = HEAP0; i < periodiccnt + HEAP0; ++i) + { + ev_periodic *w = (ev_periodic *)ANHE_w (periodics [i]); + + if (w->reschedule_cb) + ev_at (w) = w->reschedule_cb (w, ev_rt_now); + else if (w->interval) + periodic_recalc (EV_A_ w); + + ANHE_at_cache (periodics [i]); + } + + reheap (periodics, periodiccnt); +} +#endif + +/* adjust all timers by a given offset */ +noinline ecb_cold +static void +timers_reschedule (EV_P_ ev_tstamp adjust) +{ + int i; + + for (i = 0; i < timercnt; ++i) + { + ANHE *he = timers + i + HEAP0; + ANHE_w (*he)->at += adjust; + ANHE_at_cache (*he); + } +} + +/* fetch new monotonic and realtime times from the kernel */ +/* also detect if there was a timejump, and act accordingly */ +inline_speed void +time_update (EV_P_ ev_tstamp max_block) +{ +#if EV_USE_MONOTONIC + if (expect_true (have_monotonic)) + { + int i; + ev_tstamp odiff = rtmn_diff; + + mn_now = get_clock (); + + /* only fetch the realtime clock every 0.5*MIN_TIMEJUMP seconds */ + /* interpolate in the meantime */ + if (expect_true (mn_now - now_floor < MIN_TIMEJUMP * .5)) + { + ev_rt_now = rtmn_diff + mn_now; + return; + } + + now_floor = mn_now; + ev_rt_now = ev_time (); + + /* loop a few times, before making important decisions. + * on the choice of "4": one iteration isn't enough, + * in case we get preempted during the calls to + * ev_time and get_clock. a second call is almost guaranteed + * to succeed in that case, though. and looping a few more times + * doesn't hurt either as we only do this on time-jumps or + * in the unlikely event of having been preempted here. + */ + for (i = 4; --i; ) + { + ev_tstamp diff; + rtmn_diff = ev_rt_now - mn_now; + + diff = odiff - rtmn_diff; + + if (expect_true ((diff < 0. ? 
-diff : diff) < MIN_TIMEJUMP)) + return; /* all is well */ + + ev_rt_now = ev_time (); + mn_now = get_clock (); + now_floor = mn_now; + } + + /* no timer adjustment, as the monotonic clock doesn't jump */ + /* timers_reschedule (EV_A_ rtmn_diff - odiff) */ +# if EV_PERIODIC_ENABLE + periodics_reschedule (EV_A); +# endif + } + else +#endif + { + ev_rt_now = ev_time (); + + if (expect_false (mn_now > ev_rt_now || ev_rt_now > mn_now + max_block + MIN_TIMEJUMP)) + { + /* adjust timers. this is easy, as the offset is the same for all of them */ + timers_reschedule (EV_A_ ev_rt_now - mn_now); +#if EV_PERIODIC_ENABLE + periodics_reschedule (EV_A); +#endif + } + + mn_now = ev_rt_now; + } +} + +int +ev_run (EV_P_ int flags) +{ +#if EV_FEATURE_API + ++loop_depth; +#endif + + assert (("libev: ev_loop recursion during release detected", loop_done != EVBREAK_RECURSE)); + + loop_done = EVBREAK_CANCEL; + + EV_INVOKE_PENDING; /* in case we recurse, ensure ordering stays nice and clean */ + + do + { +#if EV_VERIFY >= 2 + ev_verify (EV_A); +#endif + +#ifndef _WIN32 + if (expect_false (curpid)) /* penalise the forking check even more */ + if (expect_false (getpid () != curpid)) + { + curpid = getpid (); + postfork = 1; + } +#endif + +#if EV_FORK_ENABLE + /* we might have forked, so queue fork handlers */ + if (expect_false (postfork)) + if (forkcnt) + { + queue_events (EV_A_ (W *)forks, forkcnt, EV_FORK); + EV_INVOKE_PENDING; + } +#endif + +#if EV_PREPARE_ENABLE + /* queue prepare watchers (and execute them) */ + if (expect_false (preparecnt)) + { + queue_events (EV_A_ (W *)prepares, preparecnt, EV_PREPARE); + EV_INVOKE_PENDING; + } +#endif + + if (expect_false (loop_done)) + break; + + /* we might have forked, so reify kernel state if necessary */ + if (expect_false (postfork)) + loop_fork (EV_A); + + /* update fd-related kernel structures */ + fd_reify (EV_A); + + /* calculate blocking time */ + { + ev_tstamp waittime = 0.; + ev_tstamp sleeptime = 0.; + + /* remember old timestamp for io_blocktime calculation */ + ev_tstamp prev_mn_now = mn_now; + + /* update time to cancel out callback processing overhead */ + time_update (EV_A_ 1e100); + + /* from now on, we want a pipe-wake-up */ + pipe_write_wanted = 1; + + ECB_MEMORY_FENCE; /* make sure pipe_write_wanted is visible before we check for potential skips */ + + if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped))) + { + waittime = MAX_BLOCKTIME; + + if (timercnt) + { + ev_tstamp to = ANHE_at (timers [HEAP0]) - mn_now; + if (waittime > to) waittime = to; + } + +#if EV_PERIODIC_ENABLE + if (periodiccnt) + { + ev_tstamp to = ANHE_at (periodics [HEAP0]) - ev_rt_now; + if (waittime > to) waittime = to; + } +#endif + + /* don't let timeouts decrease the waittime below timeout_blocktime */ + if (expect_false (waittime < timeout_blocktime)) + waittime = timeout_blocktime; + + /* at this point, we NEED to wait, so we have to ensure */ + /* to pass a minimum nonzero value to the backend */ + if (expect_false (waittime < backend_mintime)) + waittime = backend_mintime; + + /* extra check because io_blocktime is commonly 0 */ + if (expect_false (io_blocktime)) + { + sleeptime = io_blocktime - (mn_now - prev_mn_now); + + if (sleeptime > waittime - backend_mintime) + sleeptime = waittime - backend_mintime; + + if (expect_true (sleeptime > 0.)) + { + ev_sleep (sleeptime); + waittime -= sleeptime; + } + } + } + +#if EV_FEATURE_API + ++loop_count; +#endif + assert ((loop_done = EVBREAK_RECURSE, 1)); /* assert for side effect */ + 
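/* the central blocking call of each iteration: with EVRUN_NOWAIT set or
+         work already pending, waittime stayed 0. and the backend returns
+         immediately; otherwise it was clamped above to at least
+         backend_mintime and trimmed to the nearest timer/periodic expiry */
+ 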
backend_poll (EV_A_ waittime); + assert ((loop_done = EVBREAK_CANCEL, 1)); /* assert for side effect */ + + pipe_write_wanted = 0; /* just an optimisation, no fence needed */ + + ECB_MEMORY_FENCE_ACQUIRE; + if (pipe_write_skipped) + { + assert (("libev: pipe_w not active, but pipe not written", ev_is_active (&pipe_w))); + ev_feed_event (EV_A_ &pipe_w, EV_CUSTOM); + } + + + /* update ev_rt_now, do magic */ + time_update (EV_A_ waittime + sleeptime); + } + + /* queue pending timers and reschedule them */ + timers_reify (EV_A); /* relative timers called last */ +#if EV_PERIODIC_ENABLE + periodics_reify (EV_A); /* absolute timers called first */ +#endif + +#if EV_IDLE_ENABLE + /* queue idle watchers unless other events are pending */ + idle_reify (EV_A); +#endif + +#if EV_CHECK_ENABLE + /* queue check watchers, to be executed first */ + if (expect_false (checkcnt)) + queue_events (EV_A_ (W *)checks, checkcnt, EV_CHECK); +#endif + + EV_INVOKE_PENDING; + } + while (expect_true ( + activecnt + && !loop_done + && !(flags & (EVRUN_ONCE | EVRUN_NOWAIT)) + )); + + if (loop_done == EVBREAK_ONE) + loop_done = EVBREAK_CANCEL; + +#if EV_FEATURE_API + --loop_depth; +#endif + + return activecnt; +} + +void +ev_break (EV_P_ int how) EV_THROW +{ + loop_done = how; +} + +void +ev_ref (EV_P) EV_THROW +{ + ++activecnt; +} + +void +ev_unref (EV_P) EV_THROW +{ + --activecnt; +} + +void +ev_now_update (EV_P) EV_THROW +{ + time_update (EV_A_ 1e100); +} + +void +ev_suspend (EV_P) EV_THROW +{ + ev_now_update (EV_A); +} + +void +ev_resume (EV_P) EV_THROW +{ + ev_tstamp mn_prev = mn_now; + + ev_now_update (EV_A); + timers_reschedule (EV_A_ mn_now - mn_prev); +#if EV_PERIODIC_ENABLE + /* TODO: really do this? */ + periodics_reschedule (EV_A); +#endif +} + +/*****************************************************************************/ +/* singly-linked list management, used when the expected list length is short */ + +inline_size void +wlist_add (WL *head, WL elem) +{ + elem->next = *head; + *head = elem; +} + +inline_size void +wlist_del (WL *head, WL elem) +{ + while (*head) + { + if (expect_true (*head == elem)) + { + *head = elem->next; + break; + } + + head = &(*head)->next; + } +} + +/* internal, faster, version of ev_clear_pending */ +inline_speed void +clear_pending (EV_P_ W w) +{ + if (w->pending) + { + pendings [ABSPRI (w)][w->pending - 1].w = (W)&pending_w; + w->pending = 0; + } +} + +int +ev_clear_pending (EV_P_ void *w) EV_THROW +{ + W w_ = (W)w; + int pending = w_->pending; + + if (expect_true (pending)) + { + ANPENDING *p = pendings [ABSPRI (w_)] + pending - 1; + p->w = (W)&pending_w; + w_->pending = 0; + return p->events; + } + else + return 0; +} + +inline_size void +pri_adjust (EV_P_ W w) +{ + int pri = ev_priority (w); + pri = pri < EV_MINPRI ? EV_MINPRI : pri; + pri = pri > EV_MAXPRI ? 
EV_MAXPRI : pri; + ev_set_priority (w, pri); +} + +inline_speed void +ev_start (EV_P_ W w, int active) +{ + pri_adjust (EV_A_ w); + w->active = active; + ev_ref (EV_A); +} + +inline_size void +ev_stop (EV_P_ W w) +{ + ev_unref (EV_A); + w->active = 0; +} + +/*****************************************************************************/ + +noinline +void +ev_io_start (EV_P_ ev_io *w) EV_THROW +{ + int fd = w->fd; + + if (expect_false (ev_is_active (w))) + return; + + assert (("libev: ev_io_start called with negative fd", fd >= 0)); + assert (("libev: ev_io_start called with illegal event mask", !(w->events & ~(EV__IOFDSET | EV_READ | EV_WRITE)))); + + EV_FREQUENT_CHECK; + + ev_start (EV_A_ (W)w, 1); + array_needsize (ANFD, anfds, anfdmax, fd + 1, array_init_zero); + wlist_add (&anfds[fd].head, (WL)w); + + /* common bug, apparently */ + assert (("libev: ev_io_start called with corrupted watcher", ((WL)w)->next != (WL)w)); + + fd_change (EV_A_ fd, w->events & EV__IOFDSET | EV_ANFD_REIFY); + w->events &= ~EV__IOFDSET; + + EV_FREQUENT_CHECK; +} + +noinline +void +ev_io_stop (EV_P_ ev_io *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + assert (("libev: ev_io_stop called with illegal fd (must stay constant after start!)", w->fd >= 0 && w->fd < anfdmax)); + + EV_FREQUENT_CHECK; + + wlist_del (&anfds[w->fd].head, (WL)w); + ev_stop (EV_A_ (W)w); + + fd_change (EV_A_ w->fd, EV_ANFD_REIFY); + + EV_FREQUENT_CHECK; +} + +noinline +void +ev_timer_start (EV_P_ ev_timer *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + ev_at (w) += mn_now; + + assert (("libev: ev_timer_start called with negative timer repeat value", w->repeat >= 0.)); + + EV_FREQUENT_CHECK; + + ++timercnt; + ev_start (EV_A_ (W)w, timercnt + HEAP0 - 1); + array_needsize (ANHE, timers, timermax, ev_active (w) + 1, EMPTY2); + ANHE_w (timers [ev_active (w)]) = (WT)w; + ANHE_at_cache (timers [ev_active (w)]); + upheap (timers, ev_active (w)); + + EV_FREQUENT_CHECK; + + /*assert (("libev: internal timer heap corruption", timers [ev_active (w)] == (WT)w));*/ +} + +noinline +void +ev_timer_stop (EV_P_ ev_timer *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + { + int active = ev_active (w); + + assert (("libev: internal timer heap corruption", ANHE_w (timers [active]) == (WT)w)); + + --timercnt; + + if (expect_true (active < timercnt + HEAP0)) + { + timers [active] = timers [timercnt + HEAP0]; + adjustheap (timers, timercnt, active); + } + } + + ev_at (w) -= mn_now; + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} + +noinline +void +ev_timer_again (EV_P_ ev_timer *w) EV_THROW +{ + EV_FREQUENT_CHECK; + + clear_pending (EV_A_ (W)w); + + if (ev_is_active (w)) + { + if (w->repeat) + { + ev_at (w) = mn_now + w->repeat; + ANHE_at_cache (timers [ev_active (w)]); + adjustheap (timers, timercnt, ev_active (w)); + } + else + ev_timer_stop (EV_A_ w); + } + else if (w->repeat) + { + ev_at (w) = w->repeat; + ev_timer_start (EV_A_ w); + } + + EV_FREQUENT_CHECK; +} + +ev_tstamp +ev_timer_remaining (EV_P_ ev_timer *w) EV_THROW +{ + return ev_at (w) - (ev_is_active (w) ? 
mn_now : 0.); +} + +#if EV_PERIODIC_ENABLE +noinline +void +ev_periodic_start (EV_P_ ev_periodic *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + if (w->reschedule_cb) + ev_at (w) = w->reschedule_cb (w, ev_rt_now); + else if (w->interval) + { + assert (("libev: ev_periodic_start called with negative interval value", w->interval >= 0.)); + periodic_recalc (EV_A_ w); + } + else + ev_at (w) = w->offset; + + EV_FREQUENT_CHECK; + + ++periodiccnt; + ev_start (EV_A_ (W)w, periodiccnt + HEAP0 - 1); + array_needsize (ANHE, periodics, periodicmax, ev_active (w) + 1, EMPTY2); + ANHE_w (periodics [ev_active (w)]) = (WT)w; + ANHE_at_cache (periodics [ev_active (w)]); + upheap (periodics, ev_active (w)); + + EV_FREQUENT_CHECK; + + /*assert (("libev: internal periodic heap corruption", ANHE_w (periodics [ev_active (w)]) == (WT)w));*/ +} + +noinline +void +ev_periodic_stop (EV_P_ ev_periodic *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + { + int active = ev_active (w); + + assert (("libev: internal periodic heap corruption", ANHE_w (periodics [active]) == (WT)w)); + + --periodiccnt; + + if (expect_true (active < periodiccnt + HEAP0)) + { + periodics [active] = periodics [periodiccnt + HEAP0]; + adjustheap (periodics, periodiccnt, active); + } + } + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} + +noinline +void +ev_periodic_again (EV_P_ ev_periodic *w) EV_THROW +{ + /* TODO: use adjustheap and recalculation */ + ev_periodic_stop (EV_A_ w); + ev_periodic_start (EV_A_ w); +} +#endif + +#ifndef SA_RESTART +# define SA_RESTART 0 +#endif + +#if EV_SIGNAL_ENABLE + +noinline +void +ev_signal_start (EV_P_ ev_signal *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + assert (("libev: ev_signal_start called with illegal signal number", w->signum > 0 && w->signum < EV_NSIG)); + +#if EV_MULTIPLICITY + assert (("libev: a signal must not be attached to two different loops", + !signals [w->signum - 1].loop || signals [w->signum - 1].loop == loop)); + + signals [w->signum - 1].loop = EV_A; + ECB_MEMORY_FENCE_RELEASE; +#endif + + EV_FREQUENT_CHECK; + +#if EV_USE_SIGNALFD + if (sigfd == -2) + { + sigfd = signalfd (-1, &sigfd_set, SFD_NONBLOCK | SFD_CLOEXEC); + if (sigfd < 0 && errno == EINVAL) + sigfd = signalfd (-1, &sigfd_set, 0); /* retry without flags */ + + if (sigfd >= 0) + { + fd_intern (sigfd); /* doing it twice will not hurt */ + + sigemptyset (&sigfd_set); + + ev_io_init (&sigfd_w, sigfdcb, sigfd, EV_READ); + ev_set_priority (&sigfd_w, EV_MAXPRI); + ev_io_start (EV_A_ &sigfd_w); + ev_unref (EV_A); /* signalfd watcher should not keep loop alive */ + } + } + + if (sigfd >= 0) + { + /* TODO: check .head */ + sigaddset (&sigfd_set, w->signum); + sigprocmask (SIG_BLOCK, &sigfd_set, 0); + + signalfd (sigfd, &sigfd_set, 0); + } +#endif + + ev_start (EV_A_ (W)w, 1); + wlist_add (&signals [w->signum - 1].head, (WL)w); + + if (!((WL)w)->next) +# if EV_USE_SIGNALFD + if (sigfd < 0) /*TODO*/ +# endif + { +# ifdef _WIN32 + evpipe_init (EV_A); + + signal (w->signum, ev_sighandler); +# else + struct sigaction sa; + + evpipe_init (EV_A); + + sa.sa_handler = ev_sighandler; + sigfillset (&sa.sa_mask); + sa.sa_flags = SA_RESTART; /* if restarting works we save one iteration */ + sigaction (w->signum, &sa, 0); + + if (origflags & EVFLAG_NOSIGMASK) + { + sigemptyset (&sa.sa_mask); + sigaddset (&sa.sa_mask, w->signum); + sigprocmask (SIG_UNBLOCK, &sa.sa_mask, 0); + } +#endif + } + + EV_FREQUENT_CHECK; +} + +noinline 
+void +ev_signal_stop (EV_P_ ev_signal *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + wlist_del (&signals [w->signum - 1].head, (WL)w); + ev_stop (EV_A_ (W)w); + + if (!signals [w->signum - 1].head) + { +#if EV_MULTIPLICITY + signals [w->signum - 1].loop = 0; /* unattach from signal */ +#endif +#if EV_USE_SIGNALFD + if (sigfd >= 0) + { + sigset_t ss; + + sigemptyset (&ss); + sigaddset (&ss, w->signum); + sigdelset (&sigfd_set, w->signum); + + signalfd (sigfd, &sigfd_set, 0); + sigprocmask (SIG_UNBLOCK, &ss, 0); + } + else +#endif + signal (w->signum, SIG_DFL); + } + + EV_FREQUENT_CHECK; +} + +#endif + +#if EV_CHILD_ENABLE + +void +ev_child_start (EV_P_ ev_child *w) EV_THROW +{ +#if EV_MULTIPLICITY + assert (("libev: child watchers are only supported in the default loop", loop == ev_default_loop_ptr)); +#endif + if (expect_false (ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + ev_start (EV_A_ (W)w, 1); + wlist_add (&childs [w->pid & ((EV_PID_HASHSIZE) - 1)], (WL)w); + + EV_FREQUENT_CHECK; +} + +void +ev_child_stop (EV_P_ ev_child *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + wlist_del (&childs [w->pid & ((EV_PID_HASHSIZE) - 1)], (WL)w); + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} + +#endif + +#if EV_STAT_ENABLE + +# ifdef _WIN32 +# undef lstat +# define lstat(a,b) _stati64 (a,b) +# endif + +#define DEF_STAT_INTERVAL 5.0074891 +#define NFS_STAT_INTERVAL 30.1074891 /* for filesystems potentially failing inotify */ +#define MIN_STAT_INTERVAL 0.1074891 + +noinline static void stat_timer_cb (EV_P_ ev_timer *w_, int revents); + +#if EV_USE_INOTIFY + +/* the * 2 is to allow for alignment padding, which for some reason is >> 8 */ +# define EV_INOTIFY_BUFSIZE (sizeof (struct inotify_event) * 2 + NAME_MAX) + +noinline +static void +infy_add (EV_P_ ev_stat *w) +{ + w->wd = inotify_add_watch (fs_fd, w->path, + IN_ATTRIB | IN_DELETE_SELF | IN_MOVE_SELF | IN_MODIFY + | IN_CREATE | IN_DELETE | IN_MOVED_FROM | IN_MOVED_TO + | IN_DONT_FOLLOW | IN_MASK_ADD); + + if (w->wd >= 0) + { + struct statfs sfs; + + /* now local changes will be tracked by inotify, but remote changes won't */ + /* unless the filesystem is known to be local, we therefore still poll */ + /* also do poll on <2.6.25, but with normal frequency */ + + if (!fs_2625) + w->timer.repeat = w->interval ? w->interval : DEF_STAT_INTERVAL; + else if (!statfs (w->path, &sfs) + && (sfs.f_type == 0x1373 /* devfs */ + || sfs.f_type == 0x4006 /* fat */ + || sfs.f_type == 0x4d44 /* msdos */ + || sfs.f_type == 0xEF53 /* ext2/3 */ + || sfs.f_type == 0x72b6 /* jffs2 */ + || sfs.f_type == 0x858458f6 /* ramfs */ + || sfs.f_type == 0x5346544e /* ntfs */ + || sfs.f_type == 0x3153464a /* jfs */ + || sfs.f_type == 0x9123683e /* btrfs */ + || sfs.f_type == 0x52654973 /* reiser3 */ + || sfs.f_type == 0x01021994 /* tmpfs */ + || sfs.f_type == 0x58465342 /* xfs */)) + w->timer.repeat = 0.; /* filesystem is local, kernel new enough */ + else + w->timer.repeat = w->interval ? w->interval : NFS_STAT_INTERVAL; /* remote, use reduced frequency */ + } + else + { + /* can't use inotify, continue to stat */ + w->timer.repeat = w->interval ? 
w->interval : DEF_STAT_INTERVAL; + + /* if path is not there, monitor some parent directory for speedup hints */ + /* note that exceeding the hardcoded path limit is not a correctness issue, */ + /* but an efficiency issue only */ + if ((errno == ENOENT || errno == EACCES) && strlen (w->path) < 4096) + { + char path [4096]; + strcpy (path, w->path); + + do + { + int mask = IN_MASK_ADD | IN_DELETE_SELF | IN_MOVE_SELF + | (errno == EACCES ? IN_ATTRIB : IN_CREATE | IN_MOVED_TO); + + char *pend = strrchr (path, '/'); + + if (!pend || pend == path) + break; + + *pend = 0; + w->wd = inotify_add_watch (fs_fd, path, mask); + } + while (w->wd < 0 && (errno == ENOENT || errno == EACCES)); + } + } + + if (w->wd >= 0) + wlist_add (&fs_hash [w->wd & ((EV_INOTIFY_HASHSIZE) - 1)].head, (WL)w); + + /* now re-arm timer, if required */ + if (ev_is_active (&w->timer)) ev_ref (EV_A); + ev_timer_again (EV_A_ &w->timer); + if (ev_is_active (&w->timer)) ev_unref (EV_A); +} + +noinline +static void +infy_del (EV_P_ ev_stat *w) +{ + int slot; + int wd = w->wd; + + if (wd < 0) + return; + + w->wd = -2; + slot = wd & ((EV_INOTIFY_HASHSIZE) - 1); + wlist_del (&fs_hash [slot].head, (WL)w); + + /* remove this watcher, if others are watching it, they will rearm */ + inotify_rm_watch (fs_fd, wd); +} + +noinline +static void +infy_wd (EV_P_ int slot, int wd, struct inotify_event *ev) +{ + if (slot < 0) + /* overflow, need to check for all hash slots */ + for (slot = 0; slot < (EV_INOTIFY_HASHSIZE); ++slot) + infy_wd (EV_A_ slot, wd, ev); + else + { + WL w_; + + for (w_ = fs_hash [slot & ((EV_INOTIFY_HASHSIZE) - 1)].head; w_; ) + { + ev_stat *w = (ev_stat *)w_; + w_ = w_->next; /* lets us remove this watcher and all before it */ + + if (w->wd == wd || wd == -1) + { + if (ev->mask & (IN_IGNORED | IN_UNMOUNT | IN_DELETE_SELF)) + { + wlist_del (&fs_hash [slot & ((EV_INOTIFY_HASHSIZE) - 1)].head, (WL)w); + w->wd = -1; + infy_add (EV_A_ w); /* re-add, no matter what */ + } + + stat_timer_cb (EV_A_ &w->timer, 0); + } + } + } +} + +static void +infy_cb (EV_P_ ev_io *w, int revents) +{ + char buf [EV_INOTIFY_BUFSIZE]; + int ofs; + int len = read (fs_fd, buf, sizeof (buf)); + + for (ofs = 0; ofs < len; ) + { + struct inotify_event *ev = (struct inotify_event *)(buf + ofs); + infy_wd (EV_A_ ev->wd, ev->wd, ev); + ofs += sizeof (struct inotify_event) + ev->len; + } +} + +inline_size ecb_cold +void +ev_check_2625 (EV_P) +{ + /* kernels < 2.6.25 are borked + * http://www.ussg.indiana.edu/hypermail/linux/kernel/0711.3/1208.html + */ + if (ev_linux_version () < 0x020619) + return; + + fs_2625 = 1; +} + +inline_size int +infy_newfd (void) +{ +#if defined IN_CLOEXEC && defined IN_NONBLOCK + int fd = inotify_init1 (IN_CLOEXEC | IN_NONBLOCK); + if (fd >= 0) + return fd; +#endif + return inotify_init (); +} + +inline_size void +infy_init (EV_P) +{ + if (fs_fd != -2) + return; + + fs_fd = -1; + + ev_check_2625 (EV_A); + + fs_fd = infy_newfd (); + + if (fs_fd >= 0) + { + fd_intern (fs_fd); + ev_io_init (&fs_w, infy_cb, fs_fd, EV_READ); + ev_set_priority (&fs_w, EV_MAXPRI); + ev_io_start (EV_A_ &fs_w); + ev_unref (EV_A); + } +} + +inline_size void +infy_fork (EV_P) +{ + int slot; + + if (fs_fd < 0) + return; + + ev_ref (EV_A); + ev_io_stop (EV_A_ &fs_w); + close (fs_fd); + fs_fd = infy_newfd (); + + if (fs_fd >= 0) + { + fd_intern (fs_fd); + ev_io_set (&fs_w, fs_fd, EV_READ); + ev_io_start (EV_A_ &fs_w); + ev_unref (EV_A); + } + + for (slot = 0; slot < (EV_INOTIFY_HASHSIZE); ++slot) + { + WL w_ = fs_hash [slot].head; + fs_hash [slot].head = 0; + 
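+      /* walk the detached chain and re-attach each watcher below:
+         re-register with the fresh inotify fd if one could be opened,
+         else fall back to plain stat () polling via the watcher's timer */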
+ while (w_) + { + ev_stat *w = (ev_stat *)w_; + w_ = w_->next; /* lets us add this watcher */ + + w->wd = -1; + + if (fs_fd >= 0) + infy_add (EV_A_ w); /* re-add, no matter what */ + else + { + w->timer.repeat = w->interval ? w->interval : DEF_STAT_INTERVAL; + if (ev_is_active (&w->timer)) ev_ref (EV_A); + ev_timer_again (EV_A_ &w->timer); + if (ev_is_active (&w->timer)) ev_unref (EV_A); + } + } + } +} + +#endif + +#ifdef _WIN32 +# define EV_LSTAT(p,b) _stati64 (p, b) +#else +# define EV_LSTAT(p,b) lstat (p, b) +#endif + +void +ev_stat_stat (EV_P_ ev_stat *w) EV_THROW +{ + if (lstat (w->path, &w->attr) < 0) + w->attr.st_nlink = 0; + else if (!w->attr.st_nlink) + w->attr.st_nlink = 1; +} + +noinline +static void +stat_timer_cb (EV_P_ ev_timer *w_, int revents) +{ + ev_stat *w = (ev_stat *)(((char *)w_) - offsetof (ev_stat, timer)); + + ev_statdata prev = w->attr; + ev_stat_stat (EV_A_ w); + + /* memcmp doesn't work on netbsd, they.... do stuff to their struct stat */ + if ( + prev.st_dev != w->attr.st_dev + || prev.st_ino != w->attr.st_ino + || prev.st_mode != w->attr.st_mode + || prev.st_nlink != w->attr.st_nlink + || prev.st_uid != w->attr.st_uid + || prev.st_gid != w->attr.st_gid + || prev.st_rdev != w->attr.st_rdev + || prev.st_size != w->attr.st_size + || prev.st_atime != w->attr.st_atime + || prev.st_mtime != w->attr.st_mtime + || prev.st_ctime != w->attr.st_ctime + ) { + /* we only update w->prev on actual differences */ + /* in case we test more often than invoke the callback, */ + /* to ensure that prev is always different to attr */ + w->prev = prev; + + #if EV_USE_INOTIFY + if (fs_fd >= 0) + { + infy_del (EV_A_ w); + infy_add (EV_A_ w); + ev_stat_stat (EV_A_ w); /* avoid race... */ + } + #endif + + ev_feed_event (EV_A_ w, EV_STAT); + } +} + +void +ev_stat_start (EV_P_ ev_stat *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + ev_stat_stat (EV_A_ w); + + if (w->interval < MIN_STAT_INTERVAL && w->interval) + w->interval = MIN_STAT_INTERVAL; + + ev_timer_init (&w->timer, stat_timer_cb, 0., w->interval ? 
w->interval : DEF_STAT_INTERVAL); + ev_set_priority (&w->timer, ev_priority (w)); + +#if EV_USE_INOTIFY + infy_init (EV_A); + + if (fs_fd >= 0) + infy_add (EV_A_ w); + else +#endif + { + ev_timer_again (EV_A_ &w->timer); + ev_unref (EV_A); + } + + ev_start (EV_A_ (W)w, 1); + + EV_FREQUENT_CHECK; +} + +void +ev_stat_stop (EV_P_ ev_stat *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + +#if EV_USE_INOTIFY + infy_del (EV_A_ w); +#endif + + if (ev_is_active (&w->timer)) + { + ev_ref (EV_A); + ev_timer_stop (EV_A_ &w->timer); + } + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} +#endif + +#if EV_IDLE_ENABLE +void +ev_idle_start (EV_P_ ev_idle *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + pri_adjust (EV_A_ (W)w); + + EV_FREQUENT_CHECK; + + { + int active = ++idlecnt [ABSPRI (w)]; + + ++idleall; + ev_start (EV_A_ (W)w, active); + + array_needsize (ev_idle *, idles [ABSPRI (w)], idlemax [ABSPRI (w)], active, EMPTY2); + idles [ABSPRI (w)][active - 1] = w; + } + + EV_FREQUENT_CHECK; +} + +void +ev_idle_stop (EV_P_ ev_idle *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + { + int active = ev_active (w); + + idles [ABSPRI (w)][active - 1] = idles [ABSPRI (w)][--idlecnt [ABSPRI (w)]]; + ev_active (idles [ABSPRI (w)][active - 1]) = active; + + ev_stop (EV_A_ (W)w); + --idleall; + } + + EV_FREQUENT_CHECK; +} +#endif + +#if EV_PREPARE_ENABLE +void +ev_prepare_start (EV_P_ ev_prepare *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + ev_start (EV_A_ (W)w, ++preparecnt); + array_needsize (ev_prepare *, prepares, preparemax, preparecnt, EMPTY2); + prepares [preparecnt - 1] = w; + + EV_FREQUENT_CHECK; +} + +void +ev_prepare_stop (EV_P_ ev_prepare *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + { + int active = ev_active (w); + + prepares [active - 1] = prepares [--preparecnt]; + ev_active (prepares [active - 1]) = active; + } + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} +#endif + +#if EV_CHECK_ENABLE +void +ev_check_start (EV_P_ ev_check *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + ev_start (EV_A_ (W)w, ++checkcnt); + array_needsize (ev_check *, checks, checkmax, checkcnt, EMPTY2); + checks [checkcnt - 1] = w; + + EV_FREQUENT_CHECK; +} + +void +ev_check_stop (EV_P_ ev_check *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + { + int active = ev_active (w); + + checks [active - 1] = checks [--checkcnt]; + ev_active (checks [active - 1]) = active; + } + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} +#endif + +#if EV_EMBED_ENABLE +noinline +void +ev_embed_sweep (EV_P_ ev_embed *w) EV_THROW +{ + ev_run (w->other, EVRUN_NOWAIT); +} + +static void +embed_io_cb (EV_P_ ev_io *io, int revents) +{ + ev_embed *w = (ev_embed *)(((char *)io) - offsetof (ev_embed, io)); + + if (ev_cb (w)) + ev_feed_event (EV_A_ (W)w, EV_EMBED); + else + ev_run (w->other, EVRUN_NOWAIT); +} + +static void +embed_prepare_cb (EV_P_ ev_prepare *prepare, int revents) +{ + ev_embed *w = (ev_embed *)(((char *)prepare) - offsetof (ev_embed, prepare)); + + { + EV_P = w->other; + + while (fdchangecnt) + { + fd_reify (EV_A); + ev_run (EV_A_ EVRUN_NOWAIT); + } + } +} + +static void +embed_fork_cb (EV_P_ ev_fork *fork_w, int revents) +{ + 
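/* the parent loop detected a fork: stop embedding, flag the embedded
+     loop to rebuild its kernel state (ev_loop_fork), run it once
+     non-blocking so that happens right away, then resume embedding */
+  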
ev_embed *w = (ev_embed *)(((char *)fork_w) - offsetof (ev_embed, fork)); + + ev_embed_stop (EV_A_ w); + + { + EV_P = w->other; + + ev_loop_fork (EV_A); + ev_run (EV_A_ EVRUN_NOWAIT); + } + + ev_embed_start (EV_A_ w); +} + +#if 0 +static void +embed_idle_cb (EV_P_ ev_idle *idle, int revents) +{ + ev_idle_stop (EV_A_ idle); +} +#endif + +void +ev_embed_start (EV_P_ ev_embed *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + { + EV_P = w->other; + assert (("libev: loop to be embedded is not embeddable", backend & ev_embeddable_backends ())); + ev_io_init (&w->io, embed_io_cb, backend_fd, EV_READ); + } + + EV_FREQUENT_CHECK; + + ev_set_priority (&w->io, ev_priority (w)); + ev_io_start (EV_A_ &w->io); + + ev_prepare_init (&w->prepare, embed_prepare_cb); + ev_set_priority (&w->prepare, EV_MINPRI); + ev_prepare_start (EV_A_ &w->prepare); + + ev_fork_init (&w->fork, embed_fork_cb); + ev_fork_start (EV_A_ &w->fork); + + /*ev_idle_init (&w->idle, e,bed_idle_cb);*/ + + ev_start (EV_A_ (W)w, 1); + + EV_FREQUENT_CHECK; +} + +void +ev_embed_stop (EV_P_ ev_embed *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + ev_io_stop (EV_A_ &w->io); + ev_prepare_stop (EV_A_ &w->prepare); + ev_fork_stop (EV_A_ &w->fork); + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} +#endif + +#if EV_FORK_ENABLE +void +ev_fork_start (EV_P_ ev_fork *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + ev_start (EV_A_ (W)w, ++forkcnt); + array_needsize (ev_fork *, forks, forkmax, forkcnt, EMPTY2); + forks [forkcnt - 1] = w; + + EV_FREQUENT_CHECK; +} + +void +ev_fork_stop (EV_P_ ev_fork *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + { + int active = ev_active (w); + + forks [active - 1] = forks [--forkcnt]; + ev_active (forks [active - 1]) = active; + } + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} +#endif + +#if EV_CLEANUP_ENABLE +void +ev_cleanup_start (EV_P_ ev_cleanup *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + ev_start (EV_A_ (W)w, ++cleanupcnt); + array_needsize (ev_cleanup *, cleanups, cleanupmax, cleanupcnt, EMPTY2); + cleanups [cleanupcnt - 1] = w; + + /* cleanup watchers should never keep a refcount on the loop */ + ev_unref (EV_A); + EV_FREQUENT_CHECK; +} + +void +ev_cleanup_stop (EV_P_ ev_cleanup *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + ev_ref (EV_A); + + { + int active = ev_active (w); + + cleanups [active - 1] = cleanups [--cleanupcnt]; + ev_active (cleanups [active - 1]) = active; + } + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} +#endif + +#if EV_ASYNC_ENABLE +void +ev_async_start (EV_P_ ev_async *w) EV_THROW +{ + if (expect_false (ev_is_active (w))) + return; + + w->sent = 0; + + evpipe_init (EV_A); + + EV_FREQUENT_CHECK; + + ev_start (EV_A_ (W)w, ++asynccnt); + array_needsize (ev_async *, asyncs, asyncmax, asynccnt, EMPTY2); + asyncs [asynccnt - 1] = w; + + EV_FREQUENT_CHECK; +} + +void +ev_async_stop (EV_P_ ev_async *w) EV_THROW +{ + clear_pending (EV_A_ (W)w); + if (expect_false (!ev_is_active (w))) + return; + + EV_FREQUENT_CHECK; + + { + int active = ev_active (w); + + asyncs [active - 1] = asyncs [--asynccnt]; + ev_active (asyncs [active - 1]) = active; + } + + ev_stop (EV_A_ (W)w); + + EV_FREQUENT_CHECK; +} + +void +ev_async_send (EV_P_ ev_async *w) EV_THROW +{ + 
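/* async-safe by design: this only sets the sent flag and writes to the
+     loop's internal pipe, so unlike other watcher calls it may be used
+     from other threads or signal handlers to wake the loop */
+  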
w->sent = 1; + evpipe_write (EV_A_ &async_pending); +} +#endif + +/*****************************************************************************/ + +struct ev_once +{ + ev_io io; + ev_timer to; + void (*cb)(int revents, void *arg); + void *arg; +}; + +static void +once_cb (EV_P_ struct ev_once *once, int revents) +{ + void (*cb)(int revents, void *arg) = once->cb; + void *arg = once->arg; + + ev_io_stop (EV_A_ &once->io); + ev_timer_stop (EV_A_ &once->to); + ev_free (once); + + cb (revents, arg); +} + +static void +once_cb_io (EV_P_ ev_io *w, int revents) +{ + struct ev_once *once = (struct ev_once *)(((char *)w) - offsetof (struct ev_once, io)); + + once_cb (EV_A_ once, revents | ev_clear_pending (EV_A_ &once->to)); +} + +static void +once_cb_to (EV_P_ ev_timer *w, int revents) +{ + struct ev_once *once = (struct ev_once *)(((char *)w) - offsetof (struct ev_once, to)); + + once_cb (EV_A_ once, revents | ev_clear_pending (EV_A_ &once->io)); +} + +void +ev_once (EV_P_ int fd, int events, ev_tstamp timeout, void (*cb)(int revents, void *arg), void *arg) EV_THROW +{ + struct ev_once *once = (struct ev_once *)ev_malloc (sizeof (struct ev_once)); + + if (expect_false (!once)) + { + cb (EV_ERROR | EV_READ | EV_WRITE | EV_TIMER, arg); + return; + } + + once->cb = cb; + once->arg = arg; + + ev_init (&once->io, once_cb_io); + if (fd >= 0) + { + ev_io_set (&once->io, fd, events); + ev_io_start (EV_A_ &once->io); + } + + ev_init (&once->to, once_cb_to); + if (timeout >= 0.) + { + ev_timer_set (&once->to, timeout, 0.); + ev_timer_start (EV_A_ &once->to); + } +} + +/*****************************************************************************/ + +#if EV_WALK_ENABLE +ecb_cold +void +ev_walk (EV_P_ int types, void (*cb)(EV_P_ int type, void *w)) EV_THROW +{ + int i, j; + ev_watcher_list *wl, *wn; + + if (types & (EV_IO | EV_EMBED)) + for (i = 0; i < anfdmax; ++i) + for (wl = anfds [i].head; wl; ) + { + wn = wl->next; + +#if EV_EMBED_ENABLE + if (ev_cb ((ev_io *)wl) == embed_io_cb) + { + if (types & EV_EMBED) + cb (EV_A_ EV_EMBED, ((char *)wl) - offsetof (struct ev_embed, io)); + } + else +#endif +#if EV_USE_INOTIFY + if (ev_cb ((ev_io *)wl) == infy_cb) + ; + else +#endif + if ((ev_io *)wl != &pipe_w) + if (types & EV_IO) + cb (EV_A_ EV_IO, wl); + + wl = wn; + } + + if (types & (EV_TIMER | EV_STAT)) + for (i = timercnt + HEAP0; i-- > HEAP0; ) +#if EV_STAT_ENABLE + /*TODO: timer is not always active*/ + if (ev_cb ((ev_timer *)ANHE_w (timers [i])) == stat_timer_cb) + { + if (types & EV_STAT) + cb (EV_A_ EV_STAT, ((char *)ANHE_w (timers [i])) - offsetof (struct ev_stat, timer)); + } + else +#endif + if (types & EV_TIMER) + cb (EV_A_ EV_TIMER, ANHE_w (timers [i])); + +#if EV_PERIODIC_ENABLE + if (types & EV_PERIODIC) + for (i = periodiccnt + HEAP0; i-- > HEAP0; ) + cb (EV_A_ EV_PERIODIC, ANHE_w (periodics [i])); +#endif + +#if EV_IDLE_ENABLE + if (types & EV_IDLE) + for (j = NUMPRI; j--; ) + for (i = idlecnt [j]; i--; ) + cb (EV_A_ EV_IDLE, idles [j][i]); +#endif + +#if EV_FORK_ENABLE + if (types & EV_FORK) + for (i = forkcnt; i--; ) + if (ev_cb (forks [i]) != embed_fork_cb) + cb (EV_A_ EV_FORK, forks [i]); +#endif + +#if EV_ASYNC_ENABLE + if (types & EV_ASYNC) + for (i = asynccnt; i--; ) + cb (EV_A_ EV_ASYNC, asyncs [i]); +#endif + +#if EV_PREPARE_ENABLE + if (types & EV_PREPARE) + for (i = preparecnt; i--; ) +# if EV_EMBED_ENABLE + if (ev_cb (prepares [i]) != embed_prepare_cb) +# endif + cb (EV_A_ EV_PREPARE, prepares [i]); +#endif + +#if EV_CHECK_ENABLE + if (types & EV_CHECK) + for (i = checkcnt; i--; ) + 
cb (EV_A_ EV_CHECK, checks [i]); +#endif + +#if EV_SIGNAL_ENABLE + if (types & EV_SIGNAL) + for (i = 0; i < EV_NSIG - 1; ++i) + for (wl = signals [i].head; wl; ) + { + wn = wl->next; + cb (EV_A_ EV_SIGNAL, wl); + wl = wn; + } +#endif + +#if EV_CHILD_ENABLE + if (types & EV_CHILD) + for (i = (EV_PID_HASHSIZE); i--; ) + for (wl = childs [i]; wl; ) + { + wn = wl->next; + cb (EV_A_ EV_CHILD, wl); + wl = wn; + } +#endif +/* EV_STAT 0x00001000 /* stat data changed */ +/* EV_EMBED 0x00010000 /* embedded event loop needs sweep */ +} +#endif + +#if EV_MULTIPLICITY + #include "ev_wrap.h" +#endif + diff --git a/examples/udp_proxy/libev/ev.h b/examples/udp_proxy/libev/ev.h new file mode 100644 index 0000000..db93777 --- /dev/null +++ b/examples/udp_proxy/libev/ev.h @@ -0,0 +1,854 @@ +/* + * libev native API header + * + * Copyright (c) 2007,2008,2009,2010,2011,2012,2015 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. 
+ */
+
+#ifndef EV_H_
+#define EV_H_
+
+#ifdef __cplusplus
+# define EV_CPP(x) x
+# if __cplusplus >= 201103L
+#  define EV_THROW noexcept
+# else
+#  define EV_THROW throw ()
+# endif
+#else
+# define EV_CPP(x)
+# define EV_THROW
+#endif
+
+EV_CPP(extern "C" {)
+
+/*****************************************************************************/
+
+/* pre-4.0 compatibility */
+#ifndef EV_COMPAT3
+# define EV_COMPAT3 1
+#endif
+
+#ifndef EV_FEATURES
+# if defined __OPTIMIZE_SIZE__
+#  define EV_FEATURES 0x7c
+# else
+#  define EV_FEATURES 0x7f
+# endif
+#endif
+
+#define EV_FEATURE_CODE     ((EV_FEATURES) &  1)
+#define EV_FEATURE_DATA     ((EV_FEATURES) &  2)
+#define EV_FEATURE_CONFIG   ((EV_FEATURES) &  4)
+#define EV_FEATURE_API      ((EV_FEATURES) &  8)
+#define EV_FEATURE_WATCHERS ((EV_FEATURES) & 16)
+#define EV_FEATURE_BACKENDS ((EV_FEATURES) & 32)
+#define EV_FEATURE_OS       ((EV_FEATURES) & 64)
+
+/* these priorities are inclusive, higher priorities will be invoked earlier */
+#ifndef EV_MINPRI
+# define EV_MINPRI (EV_FEATURE_CONFIG ? -2 : 0)
+#endif
+#ifndef EV_MAXPRI
+# define EV_MAXPRI (EV_FEATURE_CONFIG ? +2 : 0)
+#endif
+
+#ifndef EV_MULTIPLICITY
+# define EV_MULTIPLICITY EV_FEATURE_CONFIG
+#endif
+
+#ifndef EV_PERIODIC_ENABLE
+# define EV_PERIODIC_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_STAT_ENABLE
+# define EV_STAT_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_PREPARE_ENABLE
+# define EV_PREPARE_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_CHECK_ENABLE
+# define EV_CHECK_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_IDLE_ENABLE
+# define EV_IDLE_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_FORK_ENABLE
+# define EV_FORK_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_CLEANUP_ENABLE
+# define EV_CLEANUP_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_SIGNAL_ENABLE
+# define EV_SIGNAL_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_CHILD_ENABLE
+# ifdef _WIN32
+#  define EV_CHILD_ENABLE 0
+# else
+#  define EV_CHILD_ENABLE EV_FEATURE_WATCHERS
+#endif
+#endif
+
+#ifndef EV_ASYNC_ENABLE
+# define EV_ASYNC_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_EMBED_ENABLE
+# define EV_EMBED_ENABLE EV_FEATURE_WATCHERS
+#endif
+
+#ifndef EV_WALK_ENABLE
+# define EV_WALK_ENABLE 0 /* not yet */
+#endif
+
+/*****************************************************************************/
+
+#if EV_CHILD_ENABLE && !EV_SIGNAL_ENABLE
+# undef EV_SIGNAL_ENABLE
+# define EV_SIGNAL_ENABLE 1
+#endif
+
+/*****************************************************************************/
+
+typedef double ev_tstamp;
+
+#include <string.h> /* for memmove */
+
+#ifndef EV_ATOMIC_T
+# include <signal.h>
+# define EV_ATOMIC_T sig_atomic_t volatile
+#endif
+
+#if EV_STAT_ENABLE
+# ifdef _WIN32
+#  include <time.h>
+#  include <sys/types.h>
+# endif
+# include <sys/stat.h>
+#endif
+
+/* support multiple event loops? 
*/ +#if EV_MULTIPLICITY +struct ev_loop; +# define EV_P struct ev_loop *loop /* a loop as sole parameter in a declaration */ +# define EV_P_ EV_P, /* a loop as first of multiple parameters */ +# define EV_A loop /* a loop as sole argument to a function call */ +# define EV_A_ EV_A, /* a loop as first of multiple arguments */ +# define EV_DEFAULT_UC ev_default_loop_uc_ () /* the default loop, if initialised, as sole arg */ +# define EV_DEFAULT_UC_ EV_DEFAULT_UC, /* the default loop as first of multiple arguments */ +# define EV_DEFAULT ev_default_loop (0) /* the default loop as sole arg */ +# define EV_DEFAULT_ EV_DEFAULT, /* the default loop as first of multiple arguments */ +#else +# define EV_P void +# define EV_P_ +# define EV_A +# define EV_A_ +# define EV_DEFAULT +# define EV_DEFAULT_ +# define EV_DEFAULT_UC +# define EV_DEFAULT_UC_ +# undef EV_EMBED_ENABLE +#endif + +/* EV_INLINE is used for functions in header files */ +#if __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +# define EV_INLINE static inline +#else +# define EV_INLINE static +#endif + +#ifdef EV_API_STATIC +# define EV_API_DECL static +#else +# define EV_API_DECL extern +#endif + +/* EV_PROTOTYPES can be used to switch of prototype declarations */ +#ifndef EV_PROTOTYPES +# define EV_PROTOTYPES 1 +#endif + +/*****************************************************************************/ + +#define EV_VERSION_MAJOR 4 +#define EV_VERSION_MINOR 24 + +/* eventmask, revents, events... */ +enum { + EV_UNDEF = (int)0xFFFFFFFF, /* guaranteed to be invalid */ + EV_NONE = 0x00, /* no events */ + EV_READ = 0x01, /* ev_io detected read will not block */ + EV_WRITE = 0x02, /* ev_io detected write will not block */ + EV__IOFDSET = 0x80, /* internal use only */ + EV_IO = EV_READ, /* alias for type-detection */ + EV_TIMER = 0x00000100, /* timer timed out */ +#if EV_COMPAT3 + EV_TIMEOUT = EV_TIMER, /* pre 4.0 API compatibility */ +#endif + EV_PERIODIC = 0x00000200, /* periodic timer timed out */ + EV_SIGNAL = 0x00000400, /* signal was received */ + EV_CHILD = 0x00000800, /* child/pid had status change */ + EV_STAT = 0x00001000, /* stat data changed */ + EV_IDLE = 0x00002000, /* event loop is idling */ + EV_PREPARE = 0x00004000, /* event loop about to poll */ + EV_CHECK = 0x00008000, /* event loop finished poll */ + EV_EMBED = 0x00010000, /* embedded event loop needs sweep */ + EV_FORK = 0x00020000, /* event loop resumed in child */ + EV_CLEANUP = 0x00040000, /* event loop resumed in child */ + EV_ASYNC = 0x00080000, /* async intra-loop signal */ + EV_CUSTOM = 0x01000000, /* for use by user code */ + EV_ERROR = (int)0x80000000 /* sent when an error occurs */ +}; + +/* can be used to add custom fields to all watchers, while losing binary compatibility */ +#ifndef EV_COMMON +# define EV_COMMON void *data; +#endif + +#ifndef EV_CB_DECLARE +# define EV_CB_DECLARE(type) void (*cb)(EV_P_ struct type *w, int revents); +#endif +#ifndef EV_CB_INVOKE +# define EV_CB_INVOKE(watcher,revents) (watcher)->cb (EV_A_ (watcher), (revents)) +#endif + +/* not official, do not use */ +#define EV_CB(type,name) void name (EV_P_ struct ev_ ## type *w, int revents) + +/* + * struct member types: + * private: you may look at them, but not change them, + * and they might not mean anything to you. + * ro: can be read anytime, but only changed when the watcher isn't active. + * rw: can be read and modified anytime, even when the watcher is active. 
+ * + * some internal details that might be helpful for debugging: + * + * active is either 0, which means the watcher is not active, + * or the array index of the watcher (periodics, timers) + * or the array index + 1 (most other watchers) + * or simply 1 for watchers that aren't in some array. + * pending is either 0, in which case the watcher isn't, + * or the array index + 1 in the pendings array. + */ + +#if EV_MINPRI == EV_MAXPRI +# define EV_DECL_PRIORITY +#elif !defined (EV_DECL_PRIORITY) +# define EV_DECL_PRIORITY int priority; +#endif + +/* shared by all watchers */ +#define EV_WATCHER(type) \ + int active; /* private */ \ + int pending; /* private */ \ + EV_DECL_PRIORITY /* private */ \ + EV_COMMON /* rw */ \ + EV_CB_DECLARE (type) /* private */ + +#define EV_WATCHER_LIST(type) \ + EV_WATCHER (type) \ + struct ev_watcher_list *next; /* private */ + +#define EV_WATCHER_TIME(type) \ + EV_WATCHER (type) \ + ev_tstamp at; /* private */ + +/* base class, nothing to see here unless you subclass */ +typedef struct ev_watcher +{ + EV_WATCHER (ev_watcher) +} ev_watcher; + +/* base class, nothing to see here unless you subclass */ +typedef struct ev_watcher_list +{ + EV_WATCHER_LIST (ev_watcher_list) +} ev_watcher_list; + +/* base class, nothing to see here unless you subclass */ +typedef struct ev_watcher_time +{ + EV_WATCHER_TIME (ev_watcher_time) +} ev_watcher_time; + +/* invoked when fd is either EV_READable or EV_WRITEable */ +/* revent EV_READ, EV_WRITE */ +typedef struct ev_io +{ + EV_WATCHER_LIST (ev_io) + + int fd; /* ro */ + int events; /* ro */ +} ev_io; + +/* invoked after a specific time, repeatable (based on monotonic clock) */ +/* revent EV_TIMEOUT */ +typedef struct ev_timer +{ + EV_WATCHER_TIME (ev_timer) + + ev_tstamp repeat; /* rw */ +} ev_timer; + +/* invoked at some specific time, possibly repeating at regular intervals (based on UTC) */ +/* revent EV_PERIODIC */ +typedef struct ev_periodic +{ + EV_WATCHER_TIME (ev_periodic) + + ev_tstamp offset; /* rw */ + ev_tstamp interval; /* rw */ + ev_tstamp (*reschedule_cb)(struct ev_periodic *w, ev_tstamp now) EV_THROW; /* rw */ +} ev_periodic; + +/* invoked when the given signal has been received */ +/* revent EV_SIGNAL */ +typedef struct ev_signal +{ + EV_WATCHER_LIST (ev_signal) + + int signum; /* ro */ +} ev_signal; + +/* invoked when sigchld is received and waitpid indicates the given pid */ +/* revent EV_CHILD */ +/* does not support priorities */ +typedef struct ev_child +{ + EV_WATCHER_LIST (ev_child) + + int flags; /* private */ + int pid; /* ro */ + int rpid; /* rw, holds the received pid */ + int rstatus; /* rw, holds the exit status, use the macros from sys/wait.h */ +} ev_child; + +#if EV_STAT_ENABLE +/* st_nlink = 0 means missing file or other error */ +# ifdef _WIN32 +typedef struct _stati64 ev_statdata; +# else +typedef struct stat ev_statdata; +# endif + +/* invoked each time the stat data changes for a given path */ +/* revent EV_STAT */ +typedef struct ev_stat +{ + EV_WATCHER_LIST (ev_stat) + + ev_timer timer; /* private */ + ev_tstamp interval; /* ro */ + const char *path; /* ro */ + ev_statdata prev; /* ro */ + ev_statdata attr; /* ro */ + + int wd; /* wd for inotify, fd for kqueue */ +} ev_stat; +#endif + +#if EV_IDLE_ENABLE +/* invoked when the nothing else needs to be done, keeps the process from blocking */ +/* revent EV_IDLE */ +typedef struct ev_idle +{ + EV_WATCHER (ev_idle) +} ev_idle; +#endif + +/* invoked for each run of the mainloop, just before the blocking call */ +/* you can still change 
events in any way you like */ +/* revent EV_PREPARE */ +typedef struct ev_prepare +{ + EV_WATCHER (ev_prepare) +} ev_prepare; + +/* invoked for each run of the mainloop, just after the blocking call */ +/* revent EV_CHECK */ +typedef struct ev_check +{ + EV_WATCHER (ev_check) +} ev_check; + +#if EV_FORK_ENABLE +/* the callback gets invoked before check in the child process when a fork was detected */ +/* revent EV_FORK */ +typedef struct ev_fork +{ + EV_WATCHER (ev_fork) +} ev_fork; +#endif + +#if EV_CLEANUP_ENABLE +/* is invoked just before the loop gets destroyed */ +/* revent EV_CLEANUP */ +typedef struct ev_cleanup +{ + EV_WATCHER (ev_cleanup) +} ev_cleanup; +#endif + +#if EV_EMBED_ENABLE +/* used to embed an event loop inside another */ +/* the callback gets invoked when the event loop has handled events, and can be 0 */ +typedef struct ev_embed +{ + EV_WATCHER (ev_embed) + + struct ev_loop *other; /* ro */ + ev_io io; /* private */ + ev_prepare prepare; /* private */ + ev_check check; /* unused */ + ev_timer timer; /* unused */ + ev_periodic periodic; /* unused */ + ev_idle idle; /* unused */ + ev_fork fork; /* private */ +#if EV_CLEANUP_ENABLE + ev_cleanup cleanup; /* unused */ +#endif +} ev_embed; +#endif + +#if EV_ASYNC_ENABLE +/* invoked when somebody calls ev_async_send on the watcher */ +/* revent EV_ASYNC */ +typedef struct ev_async +{ + EV_WATCHER (ev_async) + + EV_ATOMIC_T sent; /* private */ +} ev_async; + +# define ev_async_pending(w) (+(w)->sent) +#endif + +/* the presence of this union forces similar struct layout */ +union ev_any_watcher +{ + struct ev_watcher w; + struct ev_watcher_list wl; + + struct ev_io io; + struct ev_timer timer; + struct ev_periodic periodic; + struct ev_signal signal; + struct ev_child child; +#if EV_STAT_ENABLE + struct ev_stat stat; +#endif +#if EV_IDLE_ENABLE + struct ev_idle idle; +#endif + struct ev_prepare prepare; + struct ev_check check; +#if EV_FORK_ENABLE + struct ev_fork fork; +#endif +#if EV_CLEANUP_ENABLE + struct ev_cleanup cleanup; +#endif +#if EV_EMBED_ENABLE + struct ev_embed embed; +#endif +#if EV_ASYNC_ENABLE + struct ev_async async; +#endif +}; + +/* flag bits for ev_default_loop and ev_loop_new */ +enum { + /* the default */ + EVFLAG_AUTO = 0x00000000U, /* not quite a mask */ + /* flag bits */ + EVFLAG_NOENV = 0x01000000U, /* do NOT consult environment */ + EVFLAG_FORKCHECK = 0x02000000U, /* check for a fork in each iteration */ + /* debugging/feature disable */ + EVFLAG_NOINOTIFY = 0x00100000U, /* do not attempt to use inotify */ +#if EV_COMPAT3 + EVFLAG_NOSIGFD = 0, /* compatibility to pre-3.9 */ +#endif + EVFLAG_SIGNALFD = 0x00200000U, /* attempt to use signalfd */ + EVFLAG_NOSIGMASK = 0x00400000U /* avoid modifying the signal mask */ +}; + +/* method bits to be ored together */ +enum { + EVBACKEND_SELECT = 0x00000001U, /* available just about anywhere */ + EVBACKEND_POLL = 0x00000002U, /* !win, !aix, broken on osx */ + EVBACKEND_EPOLL = 0x00000004U, /* linux */ + EVBACKEND_KQUEUE = 0x00000008U, /* bsd, broken on osx */ + EVBACKEND_DEVPOLL = 0x00000010U, /* solaris 8 */ /* NYI */ + EVBACKEND_PORT = 0x00000020U, /* solaris 10 */ + EVBACKEND_ALL = 0x0000003FU, /* all known backends */ + EVBACKEND_MASK = 0x0000FFFFU /* all future backends */ +}; + +#if EV_PROTOTYPES +EV_API_DECL int ev_version_major (void) EV_THROW; +EV_API_DECL int ev_version_minor (void) EV_THROW; + +EV_API_DECL unsigned int ev_supported_backends (void) EV_THROW; +EV_API_DECL unsigned int ev_recommended_backends (void) EV_THROW; +EV_API_DECL unsigned int 
ev_embeddable_backends (void) EV_THROW; + +EV_API_DECL ev_tstamp ev_time (void) EV_THROW; +EV_API_DECL void ev_sleep (ev_tstamp delay) EV_THROW; /* sleep for a while */ + +/* Sets the allocation function to use, works like realloc. + * It is used to allocate and free memory. + * If it returns zero when memory needs to be allocated, the library might abort + * or take some potentially destructive action. + * The default is your system realloc function. + */ +EV_API_DECL void ev_set_allocator (void *(*cb)(void *ptr, long size) EV_THROW) EV_THROW; + +/* set the callback function to call on a + * retryable syscall error + * (such as failed select, poll, epoll_wait) + */ +EV_API_DECL void ev_set_syserr_cb (void (*cb)(const char *msg) EV_THROW) EV_THROW; + +#if EV_MULTIPLICITY + +/* the default loop is the only one that handles signals and child watchers */ +/* you can call this as often as you like */ +EV_API_DECL struct ev_loop *ev_default_loop (unsigned int flags EV_CPP (= 0)) EV_THROW; + +#ifdef EV_API_STATIC +EV_API_DECL struct ev_loop *ev_default_loop_ptr; +#endif + +EV_INLINE struct ev_loop * +ev_default_loop_uc_ (void) EV_THROW +{ + extern struct ev_loop *ev_default_loop_ptr; + + return ev_default_loop_ptr; +} + +EV_INLINE int +ev_is_default_loop (EV_P) EV_THROW +{ + return EV_A == EV_DEFAULT_UC; +} + +/* create and destroy alternative loops that don't handle signals */ +EV_API_DECL struct ev_loop *ev_loop_new (unsigned int flags EV_CPP (= 0)) EV_THROW; + +EV_API_DECL ev_tstamp ev_now (EV_P) EV_THROW; /* time w.r.t. timers and the eventloop, updated after each poll */ + +#else + +EV_API_DECL int ev_default_loop (unsigned int flags EV_CPP (= 0)) EV_THROW; /* returns true when successful */ + +EV_API_DECL ev_tstamp ev_rt_now; + +EV_INLINE ev_tstamp +ev_now (void) EV_THROW +{ + return ev_rt_now; +} + +/* looks weird, but ev_is_default_loop (EV_A) still works if this exists */ +EV_INLINE int +ev_is_default_loop (void) EV_THROW +{ + return 1; +} + +#endif /* multiplicity */ + +/* destroy event loops, also works for the default loop */ +EV_API_DECL void ev_loop_destroy (EV_P); + +/* this needs to be called after fork, to duplicate the loop */ +/* when you want to re-use it in the child */ +/* you can call it in either the parent or the child */ +/* you can actually call it at any time, anywhere :) */ +EV_API_DECL void ev_loop_fork (EV_P) EV_THROW; + +EV_API_DECL unsigned int ev_backend (EV_P) EV_THROW; /* backend in use by loop */ + +EV_API_DECL void ev_now_update (EV_P) EV_THROW; /* update event loop time */ + +#if EV_WALK_ENABLE +/* walk (almost) all watchers in the loop of a given type, invoking the */ +/* callback on every such watcher. The callback might stop the watcher, */ +/* but do nothing else with the loop */ +EV_API_DECL void ev_walk (EV_P_ int types, void (*cb)(EV_P_ int type, void *w)) EV_THROW; +#endif + +#endif /* prototypes */ + +/* ev_run flags values */ +enum { + EVRUN_NOWAIT = 1, /* do not block/wait */ + EVRUN_ONCE = 2 /* block *once* only */ +}; + +/* ev_break how values */ +enum { + EVBREAK_CANCEL = 0, /* undo unloop */ + EVBREAK_ONE = 1, /* unloop once */ + EVBREAK_ALL = 2 /* unloop all loops */ +}; + +#if EV_PROTOTYPES +EV_API_DECL int ev_run (EV_P_ int flags EV_CPP (= 0)); +EV_API_DECL void ev_break (EV_P_ int how EV_CPP (= EVBREAK_ONE)) EV_THROW; /* break out of the loop */ + +/* + * ref/unref can be used to add or remove a refcount on the mainloop. every watcher + * keeps one reference. 
if you have a long-running watcher you never unregister that + * should not keep ev_loop from running, unref() after starting, and ref() before stopping. + */ +EV_API_DECL void ev_ref (EV_P) EV_THROW; +EV_API_DECL void ev_unref (EV_P) EV_THROW; + +/* + * convenience function, wait for a single event, without registering an event watcher + * if timeout is < 0, do wait indefinitely + */ +EV_API_DECL void ev_once (EV_P_ int fd, int events, ev_tstamp timeout, void (*cb)(int revents, void *arg), void *arg) EV_THROW; + +# if EV_FEATURE_API +EV_API_DECL unsigned int ev_iteration (EV_P) EV_THROW; /* number of loop iterations */ +EV_API_DECL unsigned int ev_depth (EV_P) EV_THROW; /* #ev_loop enters - #ev_loop leaves */ +EV_API_DECL void ev_verify (EV_P) EV_THROW; /* abort if loop data corrupted */ + +EV_API_DECL void ev_set_io_collect_interval (EV_P_ ev_tstamp interval) EV_THROW; /* sleep at least this time, default 0 */ +EV_API_DECL void ev_set_timeout_collect_interval (EV_P_ ev_tstamp interval) EV_THROW; /* sleep at least this time, default 0 */ + +/* advanced stuff for threading etc. support, see docs */ +EV_API_DECL void ev_set_userdata (EV_P_ void *data) EV_THROW; +EV_API_DECL void *ev_userdata (EV_P) EV_THROW; +typedef void (*ev_loop_callback)(EV_P); +EV_API_DECL void ev_set_invoke_pending_cb (EV_P_ ev_loop_callback invoke_pending_cb) EV_THROW; +/* C++ doesn't allow the use of the ev_loop_callback typedef here, so we need to spell it out */ +EV_API_DECL void ev_set_loop_release_cb (EV_P_ void (*release)(EV_P) EV_THROW, void (*acquire)(EV_P) EV_THROW) EV_THROW; + +EV_API_DECL unsigned int ev_pending_count (EV_P) EV_THROW; /* number of pending events, if any */ +EV_API_DECL void ev_invoke_pending (EV_P); /* invoke all pending watchers */ + +/* + * stop/start the timer handling. 
+ */
+EV_API_DECL void ev_suspend (EV_P) EV_THROW;
+EV_API_DECL void ev_resume (EV_P) EV_THROW;
+#endif
+
+#endif
+
+/* these may evaluate ev multiple times, and the other arguments at most once */
+/* either use ev_init + ev_TYPE_set, or the ev_TYPE_init macro, below, to first initialise a watcher */
+#define ev_init(ev,cb_) do { \
+  ((ev_watcher *)(void *)(ev))->active = \
+  ((ev_watcher *)(void *)(ev))->pending = 0; \
+  ev_set_priority ((ev), 0); \
+  ev_set_cb ((ev), cb_); \
+} while (0)
+
+#define ev_io_set(ev,fd_,events_) do { (ev)->fd = (fd_); (ev)->events = (events_) | EV__IOFDSET; } while (0)
+#define ev_timer_set(ev,after_,repeat_) do { ((ev_watcher_time *)(ev))->at = (after_); (ev)->repeat = (repeat_); } while (0)
+#define ev_periodic_set(ev,ofs_,ival_,rcb_) do { (ev)->offset = (ofs_); (ev)->interval = (ival_); (ev)->reschedule_cb = (rcb_); } while (0)
+#define ev_signal_set(ev,signum_) do { (ev)->signum = (signum_); } while (0)
+#define ev_child_set(ev,pid_,trace_) do { (ev)->pid = (pid_); (ev)->flags = !!(trace_); } while (0)
+#define ev_stat_set(ev,path_,interval_) do { (ev)->path = (path_); (ev)->interval = (interval_); (ev)->wd = -2; } while (0)
+#define ev_idle_set(ev) /* nop, yes, this is a serious in-joke */
+#define ev_prepare_set(ev) /* nop, yes, this is a serious in-joke */
+#define ev_check_set(ev) /* nop, yes, this is a serious in-joke */
+#define ev_embed_set(ev,other_) do { (ev)->other = (other_); } while (0)
+#define ev_fork_set(ev) /* nop, yes, this is a serious in-joke */
+#define ev_cleanup_set(ev) /* nop, yes, this is a serious in-joke */
+#define ev_async_set(ev) /* nop, yes, this is a serious in-joke */
+
+#define ev_io_init(ev,cb,fd,events) do { ev_init ((ev), (cb)); ev_io_set ((ev),(fd),(events)); } while (0)
+#define ev_timer_init(ev,cb,after,repeat) do { ev_init ((ev), (cb)); ev_timer_set ((ev),(after),(repeat)); } while (0)
+#define ev_periodic_init(ev,cb,ofs,ival,rcb) do { ev_init ((ev), (cb)); ev_periodic_set ((ev),(ofs),(ival),(rcb)); } while (0)
+#define ev_signal_init(ev,cb,signum) do { ev_init ((ev), (cb)); ev_signal_set ((ev), (signum)); } while (0)
+#define ev_child_init(ev,cb,pid,trace) do { ev_init ((ev), (cb)); ev_child_set ((ev),(pid),(trace)); } while (0)
+#define ev_stat_init(ev,cb,path,interval) do { ev_init ((ev), (cb)); ev_stat_set ((ev),(path),(interval)); } while (0)
+#define ev_idle_init(ev,cb) do { ev_init ((ev), (cb)); ev_idle_set ((ev)); } while (0)
+#define ev_prepare_init(ev,cb) do { ev_init ((ev), (cb)); ev_prepare_set ((ev)); } while (0)
+#define ev_check_init(ev,cb) do { ev_init ((ev), (cb)); ev_check_set ((ev)); } while (0)
+#define ev_embed_init(ev,cb,other) do { ev_init ((ev), (cb)); ev_embed_set ((ev),(other)); } while (0)
+#define ev_fork_init(ev,cb) do { ev_init ((ev), (cb)); ev_fork_set ((ev)); } while (0)
+#define ev_cleanup_init(ev,cb) do { ev_init ((ev), (cb)); ev_cleanup_set ((ev)); } while (0)
+#define ev_async_init(ev,cb) do { ev_init ((ev), (cb)); ev_async_set ((ev)); } while (0)
+
+#define ev_is_pending(ev) (0 + ((ev_watcher *)(void *)(ev))->pending) /* ro, true when watcher is waiting for callback invocation */
+#define ev_is_active(ev) (0 + ((ev_watcher *)(void *)(ev))->active) /* ro, true when the watcher has been started */
+
+#define ev_cb_(ev) (ev)->cb /* rw */
+#define ev_cb(ev) (memmove (&ev_cb_ (ev), &((ev_watcher *)(ev))->cb, sizeof (ev_cb_ (ev))), (ev)->cb)
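+
+/*
+ * Editor's note (not part of upstream libev): a minimal usage sketch of
+ * the two-step pattern behind these macros, where ev_TYPE_init expands
+ * to ev_init followed by ev_TYPE_set, assuming a hypothetical callback
+ * stdin_cb that watches fd 0 for reads:
+ *
+ *   struct ev_loop *loop = EV_DEFAULT;
+ *   ev_io w;
+ *   ev_io_init (&w, stdin_cb, 0, EV_READ);
+ *   ev_io_start (loop, &w);
+ */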
+
+#if EV_MINPRI == EV_MAXPRI
+# define ev_priority(ev) ((ev), EV_MINPRI)
+# define ev_set_priority(ev,pri) ((ev), (pri))
+#else
+# define ev_priority(ev) (+(((ev_watcher *)(void *)(ev))->priority))
+# define ev_set_priority(ev,pri) ( (ev_watcher *)(void *)(ev))->priority = (pri)
+#endif
+
+#define ev_periodic_at(ev) (+((ev_watcher_time *)(ev))->at)
+
+#ifndef ev_set_cb
+# define ev_set_cb(ev,cb_) (ev_cb_ (ev) = (cb_), memmove (&((ev_watcher *)(ev))->cb, &ev_cb_ (ev), sizeof (ev_cb_ (ev))))
+#endif
+
+/* starting (enabling, adding) a watcher does nothing if it is already running */
+/* stopping (disabling, deleting) a watcher does nothing unless it's already running */
+#if EV_PROTOTYPES
+
+/* feeds an event into a watcher as if the event actually occurred */
+/* accepts any ev_watcher type */
+EV_API_DECL void ev_feed_event (EV_P_ void *w, int revents) EV_THROW;
+EV_API_DECL void ev_feed_fd_event (EV_P_ int fd, int revents) EV_THROW;
+#if EV_SIGNAL_ENABLE
+EV_API_DECL void ev_feed_signal (int signum) EV_THROW;
+EV_API_DECL void ev_feed_signal_event (EV_P_ int signum) EV_THROW;
+#endif
+EV_API_DECL void ev_invoke (EV_P_ void *w, int revents);
+EV_API_DECL int ev_clear_pending (EV_P_ void *w) EV_THROW;
+
+EV_API_DECL void ev_io_start (EV_P_ ev_io *w) EV_THROW;
+EV_API_DECL void ev_io_stop (EV_P_ ev_io *w) EV_THROW;
+
+EV_API_DECL void ev_timer_start (EV_P_ ev_timer *w) EV_THROW;
+EV_API_DECL void ev_timer_stop (EV_P_ ev_timer *w) EV_THROW;
+/* stops if active and no repeat, restarts if active and repeating, starts if inactive and repeating */
+EV_API_DECL void ev_timer_again (EV_P_ ev_timer *w) EV_THROW;
+/* return remaining time */
+EV_API_DECL ev_tstamp ev_timer_remaining (EV_P_ ev_timer *w) EV_THROW;
+
+#if EV_PERIODIC_ENABLE
+EV_API_DECL void ev_periodic_start (EV_P_ ev_periodic *w) EV_THROW;
+EV_API_DECL void ev_periodic_stop (EV_P_ ev_periodic *w) EV_THROW;
+EV_API_DECL void ev_periodic_again (EV_P_ ev_periodic *w) EV_THROW;
+#endif
+
+/* only supported in the default loop */
+#if EV_SIGNAL_ENABLE
+EV_API_DECL void ev_signal_start (EV_P_ ev_signal *w) EV_THROW;
+EV_API_DECL void ev_signal_stop (EV_P_ ev_signal *w) EV_THROW;
+#endif
+
+/* only supported in the default loop */
+# if EV_CHILD_ENABLE
+EV_API_DECL void ev_child_start (EV_P_ ev_child *w) EV_THROW;
+EV_API_DECL void ev_child_stop (EV_P_ ev_child *w) EV_THROW;
+# endif
+
+# if EV_STAT_ENABLE
+EV_API_DECL void ev_stat_start (EV_P_ ev_stat *w) EV_THROW;
+EV_API_DECL void ev_stat_stop (EV_P_ ev_stat *w) EV_THROW;
+EV_API_DECL void ev_stat_stat (EV_P_ ev_stat *w) EV_THROW;
+# endif
+
+# if EV_IDLE_ENABLE
+EV_API_DECL void ev_idle_start (EV_P_ ev_idle *w) EV_THROW;
+EV_API_DECL void ev_idle_stop (EV_P_ ev_idle *w) EV_THROW;
+# endif
+
+#if EV_PREPARE_ENABLE
+EV_API_DECL void ev_prepare_start (EV_P_ ev_prepare *w) EV_THROW;
+EV_API_DECL void ev_prepare_stop (EV_P_ ev_prepare *w) EV_THROW;
+#endif
+
+#if EV_CHECK_ENABLE
+EV_API_DECL void ev_check_start (EV_P_ ev_check *w) EV_THROW;
+EV_API_DECL void ev_check_stop (EV_P_ ev_check *w) EV_THROW;
+#endif
+
+# if EV_FORK_ENABLE
+EV_API_DECL void ev_fork_start (EV_P_ ev_fork *w) EV_THROW;
+EV_API_DECL void ev_fork_stop (EV_P_ ev_fork *w) EV_THROW;
+# endif
+
+# if EV_CLEANUP_ENABLE
+EV_API_DECL void ev_cleanup_start (EV_P_ ev_cleanup *w) EV_THROW;
+EV_API_DECL void ev_cleanup_stop (EV_P_ ev_cleanup *w) EV_THROW;
+# endif
+
+# if EV_EMBED_ENABLE
+/* only supported when loop to be embedded is in fact embeddable */
+EV_API_DECL void ev_embed_start (EV_P_ ev_embed *w) EV_THROW;
+EV_API_DECL void ev_embed_stop (EV_P_ ev_embed *w) EV_THROW;
+EV_API_DECL void ev_embed_sweep (EV_P_ ev_embed *w) EV_THROW;
+# endif
+
+# if 
EV_ASYNC_ENABLE +EV_API_DECL void ev_async_start (EV_P_ ev_async *w) EV_THROW; +EV_API_DECL void ev_async_stop (EV_P_ ev_async *w) EV_THROW; +EV_API_DECL void ev_async_send (EV_P_ ev_async *w) EV_THROW; +# endif + +#if EV_COMPAT3 + #define EVLOOP_NONBLOCK EVRUN_NOWAIT + #define EVLOOP_ONESHOT EVRUN_ONCE + #define EVUNLOOP_CANCEL EVBREAK_CANCEL + #define EVUNLOOP_ONE EVBREAK_ONE + #define EVUNLOOP_ALL EVBREAK_ALL + #if EV_PROTOTYPES + EV_INLINE void ev_loop (EV_P_ int flags) { ev_run (EV_A_ flags); } + EV_INLINE void ev_unloop (EV_P_ int how ) { ev_break (EV_A_ how ); } + EV_INLINE void ev_default_destroy (void) { ev_loop_destroy (EV_DEFAULT); } + EV_INLINE void ev_default_fork (void) { ev_loop_fork (EV_DEFAULT); } + #if EV_FEATURE_API + EV_INLINE unsigned int ev_loop_count (EV_P) { return ev_iteration (EV_A); } + EV_INLINE unsigned int ev_loop_depth (EV_P) { return ev_depth (EV_A); } + EV_INLINE void ev_loop_verify (EV_P) { ev_verify (EV_A); } + #endif + #endif +#else + typedef struct ev_loop ev_loop; +#endif + +#endif + +EV_CPP(}) + +#endif + diff --git a/examples/udp_proxy/libev/ev_epoll.c b/examples/udp_proxy/libev/ev_epoll.c new file mode 100644 index 0000000..df118a6 --- /dev/null +++ b/examples/udp_proxy/libev/ev_epoll.c @@ -0,0 +1,285 @@ +/* + * libev epoll fd activity backend + * + * Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +/* + * general notes about epoll: + * + * a) epoll silently removes fds from the fd set. 
as nothing tells us
+ * that an fd has been removed otherwise, we have to continually
+ * "rearm" fds that we suspect *might* have changed (same
+ * problem with kqueue, but much less costly there).
+ * b) the fact that ADD != MOD creates a lot of extra syscalls due to a)
+ * and seems not to have any advantage.
+ * c) the inability to handle fork or file descriptors (think dup)
+ * limits the applicability over poll, so this is not a generic
+ * poll replacement.
+ * d) epoll doesn't work the same as select with many file descriptors
+ * (such as files). while not critical, no other advanced interface
+ * seems to share this (rather non-unixy) limitation.
+ * e) epoll claims to be embeddable, but in practise you never get
+ * a ready event for the epoll fd (broken: <=2.6.26, working: >=2.6.32).
+ * f) epoll_ctl returning EPERM means the fd is always ready.
+ *
+ * lots of "weird code" and complication handling in this file is due
+ * to these design problems with epoll, as we try very hard to avoid
+ * epoll_ctl syscalls for common usage patterns and handle the breakage
+ * ensuing from receiving events for closed and otherwise long gone
+ * file descriptors.
+ */
+
+#include <sys/epoll.h>
+
+#define EV_EMASK_EPERM 0x80
+
+static void
+epoll_modify (EV_P_ int fd, int oev, int nev)
+{
+  struct epoll_event ev;
+  unsigned char oldmask;
+
+  /*
+   * we handle EPOLL_CTL_DEL by ignoring it here
+   * on the assumption that the fd is gone anyways
+   * if that is wrong, we have to handle the spurious
+   * event in epoll_poll.
+   * if the fd is added again, we try to ADD it, and, if that
+   * fails, we assume it still has the same eventmask.
+   */
+  if (!nev)
+    return;
+
+  oldmask = anfds [fd].emask;
+  anfds [fd].emask = nev;
+
+  /* store the generation counter in the upper 32 bits, the fd in the lower 32 bits */
+  ev.data.u64 = (uint64_t)(uint32_t)fd
+                | ((uint64_t)(uint32_t)++anfds [fd].egen << 32);
+  ev.events = (nev & EV_READ ? EPOLLIN : 0)
+              | (nev & EV_WRITE ? EPOLLOUT : 0);
+
+  if (expect_true (!epoll_ctl (backend_fd, oev && oldmask != nev ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev)))
+    return;
+
+  if (expect_true (errno == ENOENT))
+    {
+      /* if ENOENT then the fd went away, so try to do the right thing */
+      if (!nev)
+        goto dec_egen;
+
+      if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev))
+        return;
+    }
+  else if (expect_true (errno == EEXIST))
+    {
+      /* EEXIST means we ignored a previous DEL, but the fd is still active */
+      /* if the kernel mask is the same as the new mask, we assume it hasn't changed */
+      if (oldmask == nev)
+        goto dec_egen;
+
+      if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev))
+        return;
+    }
+  else if (expect_true (errno == EPERM))
+    {
+      /* EPERM means the fd is always ready, but epoll is too snobbish */
+      /* to handle it, unlike select or poll. */
+      anfds [fd].emask = EV_EMASK_EPERM;
+
+      /* add fd to epoll_eperms, if not already inside */
+      if (!(oldmask & EV_EMASK_EPERM))
+        {
+          array_needsize (int, epoll_eperms, epoll_epermmax, epoll_epermcnt + 1, EMPTY2);
+          epoll_eperms [epoll_epermcnt++] = fd;
+        }
+
+      return;
+    }
+
+  fd_kill (EV_A_ fd);
+
+dec_egen:
+  /* we didn't successfully call epoll_ctl, so decrement the generation counter again */
+  --anfds [fd].egen;
+}
+
+static void
+epoll_poll (EV_P_ ev_tstamp timeout)
+{
+  int i;
+  int eventcnt;
+
+  if (expect_false (epoll_epermcnt))
+    timeout = 0.;
+
+  /* epoll wait times cannot be larger than (LONG_MAX - 999UL) / HZ msecs, which is below */
+  /* the default libev max wait time, however. 
*/ + EV_RELEASE_CB; + eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, timeout * 1e3); + EV_ACQUIRE_CB; + + if (expect_false (eventcnt < 0)) + { + if (errno != EINTR) + ev_syserr ("(libev) epoll_wait"); + + return; + } + + for (i = 0; i < eventcnt; ++i) + { + struct epoll_event *ev = epoll_events + i; + + int fd = (uint32_t)ev->data.u64; /* mask out the lower 32 bits */ + int want = anfds [fd].events; + int got = (ev->events & (EPOLLOUT | EPOLLERR | EPOLLHUP) ? EV_WRITE : 0) + | (ev->events & (EPOLLIN | EPOLLERR | EPOLLHUP) ? EV_READ : 0); + + /* + * check for spurious notification. + * this only finds spurious notifications on egen updates + * other spurious notifications will be found by epoll_ctl, below + * we assume that fd is always in range, as we never shrink the anfds array + */ + if (expect_false ((uint32_t)anfds [fd].egen != (uint32_t)(ev->data.u64 >> 32))) + { + /* recreate kernel state */ + postfork |= 2; + continue; + } + + if (expect_false (got & ~want)) + { + anfds [fd].emask = want; + + /* + * we received an event but are not interested in it, try mod or del + * this often happens because we optimistically do not unregister fds + * when we are no longer interested in them, but also when we get spurious + * notifications for fds from another process. this is partially handled + * above with the gencounter check (== our fd is not the event fd), and + * partially here, when epoll_ctl returns an error (== a child has the fd + * but we closed it). + */ + ev->events = (want & EV_READ ? EPOLLIN : 0) + | (want & EV_WRITE ? EPOLLOUT : 0); + + /* pre-2.6.9 kernels require a non-null pointer with EPOLL_CTL_DEL, */ + /* which is fortunately easy to do for us. */ + if (epoll_ctl (backend_fd, want ? EPOLL_CTL_MOD : EPOLL_CTL_DEL, fd, ev)) + { + postfork |= 2; /* an error occurred, recreate kernel state */ + continue; + } + } + + fd_event (EV_A_ fd, got); + } + + /* if the receive array was full, increase its size */ + if (expect_false (eventcnt == epoll_eventmax)) + { + ev_free (epoll_events); + epoll_eventmax = array_nextsize (sizeof (struct epoll_event), epoll_eventmax, epoll_eventmax + 1); + epoll_events = (struct epoll_event *)ev_malloc (sizeof (struct epoll_event) * epoll_eventmax); + } + + /* now synthesize events for all fds where epoll fails, while select works... 
*/ + for (i = epoll_epermcnt; i--; ) + { + int fd = epoll_eperms [i]; + unsigned char events = anfds [fd].events & (EV_READ | EV_WRITE); + + if (anfds [fd].emask & EV_EMASK_EPERM && events) + fd_event (EV_A_ fd, events); + else + { + epoll_eperms [i] = epoll_eperms [--epoll_epermcnt]; + anfds [fd].emask = 0; + } + } +} + +inline_size +int +epoll_init (EV_P_ int flags) +{ +#ifdef EPOLL_CLOEXEC + backend_fd = epoll_create1 (EPOLL_CLOEXEC); + + if (backend_fd < 0 && (errno == EINVAL || errno == ENOSYS)) +#endif + backend_fd = epoll_create (256); + + if (backend_fd < 0) + return 0; + + fcntl (backend_fd, F_SETFD, FD_CLOEXEC); + + backend_mintime = 1e-3; /* epoll does sometimes return early, this is just to avoid the worst */ + backend_modify = epoll_modify; + backend_poll = epoll_poll; + + epoll_eventmax = 64; /* initial number of events receivable per poll */ + epoll_events = (struct epoll_event *)ev_malloc (sizeof (struct epoll_event) * epoll_eventmax); + + return EVBACKEND_EPOLL; +} + +inline_size +void +epoll_destroy (EV_P) +{ + ev_free (epoll_events); + array_free (epoll_eperm, EMPTY); +} + +inline_size +void +epoll_fork (EV_P) +{ + close (backend_fd); + + while ((backend_fd = epoll_create (256)) < 0) + ev_syserr ("(libev) epoll_create"); + + fcntl (backend_fd, F_SETFD, FD_CLOEXEC); + + fd_rearm_all (EV_A); +} + diff --git a/examples/udp_proxy/libev/ev_poll.c b/examples/udp_proxy/libev/ev_poll.c new file mode 100644 index 0000000..bd742b0 --- /dev/null +++ b/examples/udp_proxy/libev/ev_poll.c @@ -0,0 +1,151 @@ +/* + * libev poll fd activity backend + * + * Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. 
If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#include <poll.h>
+
+inline_size
+void
+pollidx_init (int *base, int count)
+{
+  /* consider using memset (.., -1, ...), which is practically guaranteed
+   * to work on all systems implementing poll */
+  while (count--)
+    *base++ = -1;
+}
+
+static void
+poll_modify (EV_P_ int fd, int oev, int nev)
+{
+  int idx;
+
+  if (oev == nev)
+    return;
+
+  array_needsize (int, pollidxs, pollidxmax, fd + 1, pollidx_init);
+
+  idx = pollidxs [fd];
+
+  if (idx < 0) /* need to allocate a new pollfd */
+    {
+      pollidxs [fd] = idx = pollcnt++;
+      array_needsize (struct pollfd, polls, pollmax, pollcnt, EMPTY2);
+      polls [idx].fd = fd;
+    }
+
+  assert (polls [idx].fd == fd);
+
+  if (nev)
+    polls [idx].events =
+      (nev & EV_READ ? POLLIN : 0)
+      | (nev & EV_WRITE ? POLLOUT : 0);
+  else /* remove pollfd */
+    {
+      pollidxs [fd] = -1;
+
+      if (expect_true (idx < --pollcnt))
+        {
+          polls [idx] = polls [pollcnt];
+          pollidxs [polls [idx].fd] = idx;
+        }
+    }
+}
+
+static void
+poll_poll (EV_P_ ev_tstamp timeout)
+{
+  struct pollfd *p;
+  int res;
+
+  EV_RELEASE_CB;
+  res = poll (polls, pollcnt, timeout * 1e3);
+  EV_ACQUIRE_CB;
+
+  if (expect_false (res < 0))
+    {
+      if (errno == EBADF)
+        fd_ebadf (EV_A);
+      else if (errno == ENOMEM && !syserr_cb)
+        fd_enomem (EV_A);
+      else if (errno != EINTR)
+        ev_syserr ("(libev) poll");
+    }
+  else
+    for (p = polls; res; ++p)
+      {
+        assert (("libev: poll() returned illegal result, broken BSD kernel?", p < polls + pollcnt));
+
+        if (expect_false (p->revents)) /* this expect is debatable */
+          {
+            --res;
+
+            if (expect_false (p->revents & POLLNVAL))
+              fd_kill (EV_A_ p->fd);
+            else
+              fd_event (
+                EV_A_
+                p->fd,
+                (p->revents & (POLLOUT | POLLERR | POLLHUP) ? EV_WRITE : 0)
+                | (p->revents & (POLLIN | POLLERR | POLLHUP) ? EV_READ : 0)
+              );
+          }
+      }
+}
+
+inline_size
+int
+poll_init (EV_P_ int flags)
+{
+  backend_mintime = 1e-3;
+  backend_modify = poll_modify;
+  backend_poll = poll_poll;
+
+  pollidxs = 0; pollidxmax = 0;
+  polls = 0; pollmax = 0; pollcnt = 0;
+
+  return EVBACKEND_POLL;
+}
+
+inline_size
+void
+poll_destroy (EV_P)
+{
+  ev_free (pollidxs);
+  ev_free (polls);
+}
+
diff --git a/examples/udp_proxy/libev/ev_select.c b/examples/udp_proxy/libev/ev_select.c
new file mode 100644
index 0000000..ed1fc7a
--- /dev/null
+++ b/examples/udp_proxy/libev/ev_select.c
@@ -0,0 +1,316 @@
+/*
+ * libev select fd activity backend
+ *
+ * Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann <libev@schmorp.de>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifndef _WIN32
+/* for unix systems */
+# include <inttypes.h>
+# ifndef __hpux
+/* for REAL unix systems */
+#  include <sys/select.h>
+# endif
+#endif
+
+#ifndef EV_SELECT_USE_FD_SET
+# ifdef NFDBITS
+#  define EV_SELECT_USE_FD_SET 0
+# else
+#  define EV_SELECT_USE_FD_SET 1
+# endif
+#endif
+
+#if EV_SELECT_IS_WINSOCKET
+# undef EV_SELECT_USE_FD_SET
+# define EV_SELECT_USE_FD_SET 1
+# undef NFDBITS
+# define NFDBITS 0
+#endif
+
+#if !EV_SELECT_USE_FD_SET
+# define NFDBYTES (NFDBITS / 8)
+#endif
+
+#include <string.h>
+
+static void
+select_modify (EV_P_ int fd, int oev, int nev)
+{
+  if (oev == nev)
+    return;
+
+  {
+#if EV_SELECT_USE_FD_SET
+
+    #if EV_SELECT_IS_WINSOCKET
+    SOCKET handle = anfds [fd].handle;
+    #else
+    int handle = fd;
+    #endif
+
+    assert (("libev: fd >= FD_SETSIZE passed to fd_set-based select backend", fd < FD_SETSIZE));
+
+    /* FD_SET is broken on windows (it adds the fd to a set twice or more,
+     * which eventually leads to overflows). Need to call it only on changes. 
+ */ + #if EV_SELECT_IS_WINSOCKET + if ((oev ^ nev) & EV_READ) + #endif + if (nev & EV_READ) + FD_SET (handle, (fd_set *)vec_ri); + else + FD_CLR (handle, (fd_set *)vec_ri); + + #if EV_SELECT_IS_WINSOCKET + if ((oev ^ nev) & EV_WRITE) + #endif + if (nev & EV_WRITE) + FD_SET (handle, (fd_set *)vec_wi); + else + FD_CLR (handle, (fd_set *)vec_wi); + +#else + + int word = fd / NFDBITS; + fd_mask mask = 1UL << (fd % NFDBITS); + + if (expect_false (vec_max <= word)) + { + int new_max = word + 1; + + vec_ri = ev_realloc (vec_ri, new_max * NFDBYTES); + vec_ro = ev_realloc (vec_ro, new_max * NFDBYTES); /* could free/malloc */ + vec_wi = ev_realloc (vec_wi, new_max * NFDBYTES); + vec_wo = ev_realloc (vec_wo, new_max * NFDBYTES); /* could free/malloc */ + #ifdef _WIN32 + vec_eo = ev_realloc (vec_eo, new_max * NFDBYTES); /* could free/malloc */ + #endif + + for (; vec_max < new_max; ++vec_max) + ((fd_mask *)vec_ri) [vec_max] = + ((fd_mask *)vec_wi) [vec_max] = 0; + } + + ((fd_mask *)vec_ri) [word] |= mask; + if (!(nev & EV_READ)) + ((fd_mask *)vec_ri) [word] &= ~mask; + + ((fd_mask *)vec_wi) [word] |= mask; + if (!(nev & EV_WRITE)) + ((fd_mask *)vec_wi) [word] &= ~mask; +#endif + } +} + +static void +select_poll (EV_P_ ev_tstamp timeout) +{ + struct timeval tv; + int res; + int fd_setsize; + + EV_RELEASE_CB; + EV_TV_SET (tv, timeout); + +#if EV_SELECT_USE_FD_SET + fd_setsize = sizeof (fd_set); +#else + fd_setsize = vec_max * NFDBYTES; +#endif + + memcpy (vec_ro, vec_ri, fd_setsize); + memcpy (vec_wo, vec_wi, fd_setsize); + +#ifdef _WIN32 + /* pass in the write set as except set. + * the idea behind this is to work around a windows bug that causes + * errors to be reported as an exception and not by setting + * the writable bit. this is so uncontrollably lame. + */ + memcpy (vec_eo, vec_wi, fd_setsize); + res = select (vec_max * NFDBITS, (fd_set *)vec_ro, (fd_set *)vec_wo, (fd_set *)vec_eo, &tv); +#elif EV_SELECT_USE_FD_SET + fd_setsize = anfdmax < FD_SETSIZE ? anfdmax : FD_SETSIZE; + res = select (fd_setsize, (fd_set *)vec_ro, (fd_set *)vec_wo, 0, &tv); +#else + res = select (vec_max * NFDBITS, (fd_set *)vec_ro, (fd_set *)vec_wo, 0, &tv); +#endif + EV_ACQUIRE_CB; + + if (expect_false (res < 0)) + { + #if EV_SELECT_IS_WINSOCKET + errno = WSAGetLastError (); + #endif + #ifdef WSABASEERR + /* on windows, select returns incompatible error codes, fix this */ + if (errno >= WSABASEERR && errno < WSABASEERR + 1000) + if (errno == WSAENOTSOCK) + errno = EBADF; + else + errno -= WSABASEERR; + #endif + + #ifdef _WIN32 + /* select on windows erroneously returns EINVAL when no fd sets have been + * provided (this is documented). what microsoft doesn't tell you that this bug + * exists even when the fd sets _are_ provided, so we have to check for this bug + * here and emulate by sleeping manually. + * we also get EINVAL when the timeout is invalid, but we ignore this case here + * and assume that EINVAL always means: you have to wait manually. + */ + if (errno == EINVAL) + { + if (timeout) + { + unsigned long ms = timeout * 1e3; + Sleep (ms ? 
ms : 1); + } + + return; + } + #endif + + if (errno == EBADF) + fd_ebadf (EV_A); + else if (errno == ENOMEM && !syserr_cb) + fd_enomem (EV_A); + else if (errno != EINTR) + ev_syserr ("(libev) select"); + + return; + } + +#if EV_SELECT_USE_FD_SET + + { + int fd; + + for (fd = 0; fd < anfdmax; ++fd) + if (anfds [fd].events) + { + int events = 0; + #if EV_SELECT_IS_WINSOCKET + SOCKET handle = anfds [fd].handle; + #else + int handle = fd; + #endif + + if (FD_ISSET (handle, (fd_set *)vec_ro)) events |= EV_READ; + if (FD_ISSET (handle, (fd_set *)vec_wo)) events |= EV_WRITE; + #ifdef _WIN32 + if (FD_ISSET (handle, (fd_set *)vec_eo)) events |= EV_WRITE; + #endif + + if (expect_true (events)) + fd_event (EV_A_ fd, events); + } + } + +#else + + { + int word, bit; + for (word = vec_max; word--; ) + { + fd_mask word_r = ((fd_mask *)vec_ro) [word]; + fd_mask word_w = ((fd_mask *)vec_wo) [word]; + #ifdef _WIN32 + word_w |= ((fd_mask *)vec_eo) [word]; + #endif + + if (word_r || word_w) + for (bit = NFDBITS; bit--; ) + { + fd_mask mask = 1UL << bit; + int events = 0; + + events |= word_r & mask ? EV_READ : 0; + events |= word_w & mask ? EV_WRITE : 0; + + if (expect_true (events)) + fd_event (EV_A_ word * NFDBITS + bit, events); + } + } + } + +#endif +} + +inline_size +int +select_init (EV_P_ int flags) +{ + backend_mintime = 1e-6; + backend_modify = select_modify; + backend_poll = select_poll; + +#if EV_SELECT_USE_FD_SET + vec_ri = ev_malloc (sizeof (fd_set)); FD_ZERO ((fd_set *)vec_ri); + vec_ro = ev_malloc (sizeof (fd_set)); + vec_wi = ev_malloc (sizeof (fd_set)); FD_ZERO ((fd_set *)vec_wi); + vec_wo = ev_malloc (sizeof (fd_set)); + #ifdef _WIN32 + vec_eo = ev_malloc (sizeof (fd_set)); + #endif +#else + vec_max = 0; + vec_ri = 0; + vec_ro = 0; + vec_wi = 0; + vec_wo = 0; + #ifdef _WIN32 + vec_eo = 0; + #endif +#endif + + return EVBACKEND_SELECT; +} + +inline_size +void +select_destroy (EV_P) +{ + ev_free (vec_ri); + ev_free (vec_ro); + ev_free (vec_wi); + ev_free (vec_wo); + #ifdef _WIN32 + ev_free (vec_eo); + #endif +} + diff --git a/examples/udp_proxy/libev/ev_vars.h b/examples/udp_proxy/libev/ev_vars.h new file mode 100644 index 0000000..04d4db1 --- /dev/null +++ b/examples/udp_proxy/libev/ev_vars.h @@ -0,0 +1,204 @@ +/* + * loop member variable declarations + * + * Copyright (c) 2007,2008,2009,2010,2011,2012,2013 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#define VARx(type,name) VAR(name, type name) + +VARx(ev_tstamp, now_floor) /* last time we refreshed rt_time */ +VARx(ev_tstamp, mn_now) /* monotonic clock "now" */ +VARx(ev_tstamp, rtmn_diff) /* difference realtime - monotonic time */ + +/* for reverse feeding of events */ +VARx(W *, rfeeds) +VARx(int, rfeedmax) +VARx(int, rfeedcnt) + +VAR (pendings, ANPENDING *pendings [NUMPRI]) +VAR (pendingmax, int pendingmax [NUMPRI]) +VAR (pendingcnt, int pendingcnt [NUMPRI]) +VARx(int, pendingpri) /* highest priority currently pending */ +VARx(ev_prepare, pending_w) /* dummy pending watcher */ + +VARx(ev_tstamp, io_blocktime) +VARx(ev_tstamp, timeout_blocktime) + +VARx(int, backend) +VARx(int, activecnt) /* total number of active events ("refcount") */ +VARx(EV_ATOMIC_T, loop_done) /* signal by ev_break */ + +VARx(int, backend_fd) +VARx(ev_tstamp, backend_mintime) /* assumed typical timer resolution */ +VAR (backend_modify, void (*backend_modify)(EV_P_ int fd, int oev, int nev)) +VAR (backend_poll , void (*backend_poll)(EV_P_ ev_tstamp timeout)) + +VARx(ANFD *, anfds) +VARx(int, anfdmax) + +VAR (evpipe, int evpipe [2]) +VARx(ev_io, pipe_w) +VARx(EV_ATOMIC_T, pipe_write_wanted) +VARx(EV_ATOMIC_T, pipe_write_skipped) + +#if !defined(_WIN32) || EV_GENWRAP +VARx(pid_t, curpid) +#endif + +VARx(char, postfork) /* true if we need to recreate kernel state after fork */ + +#if EV_USE_SELECT || EV_GENWRAP +VARx(void *, vec_ri) +VARx(void *, vec_ro) +VARx(void *, vec_wi) +VARx(void *, vec_wo) +#if defined(_WIN32) || EV_GENWRAP +VARx(void *, vec_eo) +#endif +VARx(int, vec_max) +#endif + +#if EV_USE_POLL || EV_GENWRAP +VARx(struct pollfd *, polls) +VARx(int, pollmax) +VARx(int, pollcnt) +VARx(int *, pollidxs) /* maps fds into structure indices */ +VARx(int, pollidxmax) +#endif + +#if EV_USE_EPOLL || EV_GENWRAP +VARx(struct epoll_event *, epoll_events) +VARx(int, epoll_eventmax) +VARx(int *, epoll_eperms) +VARx(int, epoll_epermcnt) +VARx(int, epoll_epermmax) +#endif + +#if EV_USE_KQUEUE || EV_GENWRAP +VARx(pid_t, kqueue_fd_pid) +VARx(struct kevent *, kqueue_changes) +VARx(int, kqueue_changemax) +VARx(int, kqueue_changecnt) +VARx(struct kevent *, kqueue_events) +VARx(int, kqueue_eventmax) +#endif + +#if EV_USE_PORT || EV_GENWRAP +VARx(struct port_event *, port_events) +VARx(int, port_eventmax) +#endif + +#if EV_USE_IOCP 
|| EV_GENWRAP +VARx(HANDLE, iocp) +#endif + +VARx(int *, fdchanges) +VARx(int, fdchangemax) +VARx(int, fdchangecnt) + +VARx(ANHE *, timers) +VARx(int, timermax) +VARx(int, timercnt) + +#if EV_PERIODIC_ENABLE || EV_GENWRAP +VARx(ANHE *, periodics) +VARx(int, periodicmax) +VARx(int, periodiccnt) +#endif + +#if EV_IDLE_ENABLE || EV_GENWRAP +VAR (idles, ev_idle **idles [NUMPRI]) +VAR (idlemax, int idlemax [NUMPRI]) +VAR (idlecnt, int idlecnt [NUMPRI]) +#endif +VARx(int, idleall) /* total number */ + +VARx(struct ev_prepare **, prepares) +VARx(int, preparemax) +VARx(int, preparecnt) + +VARx(struct ev_check **, checks) +VARx(int, checkmax) +VARx(int, checkcnt) + +#if EV_FORK_ENABLE || EV_GENWRAP +VARx(struct ev_fork **, forks) +VARx(int, forkmax) +VARx(int, forkcnt) +#endif + +#if EV_CLEANUP_ENABLE || EV_GENWRAP +VARx(struct ev_cleanup **, cleanups) +VARx(int, cleanupmax) +VARx(int, cleanupcnt) +#endif + +#if EV_ASYNC_ENABLE || EV_GENWRAP +VARx(EV_ATOMIC_T, async_pending) +VARx(struct ev_async **, asyncs) +VARx(int, asyncmax) +VARx(int, asynccnt) +#endif + +#if EV_USE_INOTIFY || EV_GENWRAP +VARx(int, fs_fd) +VARx(ev_io, fs_w) +VARx(char, fs_2625) /* whether we are running in linux 2.6.25 or newer */ +VAR (fs_hash, ANFS fs_hash [EV_INOTIFY_HASHSIZE]) +#endif + +VARx(EV_ATOMIC_T, sig_pending) +#if EV_USE_SIGNALFD || EV_GENWRAP +VARx(int, sigfd) +VARx(ev_io, sigfd_w) +VARx(sigset_t, sigfd_set) +#endif + +VARx(unsigned int, origflags) /* original loop flags */ + +#if EV_FEATURE_API || EV_GENWRAP +VARx(unsigned int, loop_count) /* total number of loop iterations/blocks */ +VARx(unsigned int, loop_depth) /* #ev_run enters - #ev_run leaves */ + +VARx(void *, userdata) +/* C++ doesn't support the ev_loop_callback typedef here. stinks. */ +VAR (release_cb, void (*release_cb)(EV_P) EV_THROW) +VAR (acquire_cb, void (*acquire_cb)(EV_P) EV_THROW) +VAR (invoke_cb , ev_loop_callback invoke_cb) +#endif + +#undef VARx + diff --git a/examples/udp_proxy/libev/ev_wrap.h b/examples/udp_proxy/libev/ev_wrap.h new file mode 100644 index 0000000..ad989ea --- /dev/null +++ b/examples/udp_proxy/libev/ev_wrap.h @@ -0,0 +1,200 @@ +/* DO NOT EDIT, automatically generated by update_ev_wrap */ +#ifndef EV_WRAP_H +#define EV_WRAP_H +#define acquire_cb ((loop)->acquire_cb) +#define activecnt ((loop)->activecnt) +#define anfdmax ((loop)->anfdmax) +#define anfds ((loop)->anfds) +#define async_pending ((loop)->async_pending) +#define asynccnt ((loop)->asynccnt) +#define asyncmax ((loop)->asyncmax) +#define asyncs ((loop)->asyncs) +#define backend ((loop)->backend) +#define backend_fd ((loop)->backend_fd) +#define backend_mintime ((loop)->backend_mintime) +#define backend_modify ((loop)->backend_modify) +#define backend_poll ((loop)->backend_poll) +#define checkcnt ((loop)->checkcnt) +#define checkmax ((loop)->checkmax) +#define checks ((loop)->checks) +#define cleanupcnt ((loop)->cleanupcnt) +#define cleanupmax ((loop)->cleanupmax) +#define cleanups ((loop)->cleanups) +#define curpid ((loop)->curpid) +#define epoll_epermcnt ((loop)->epoll_epermcnt) +#define epoll_epermmax ((loop)->epoll_epermmax) +#define epoll_eperms ((loop)->epoll_eperms) +#define epoll_eventmax ((loop)->epoll_eventmax) +#define epoll_events ((loop)->epoll_events) +#define evpipe ((loop)->evpipe) +#define fdchangecnt ((loop)->fdchangecnt) +#define fdchangemax ((loop)->fdchangemax) +#define fdchanges ((loop)->fdchanges) +#define forkcnt ((loop)->forkcnt) +#define forkmax ((loop)->forkmax) +#define forks ((loop)->forks) +#define fs_2625 ((loop)->fs_2625) +#define 
fs_fd ((loop)->fs_fd) +#define fs_hash ((loop)->fs_hash) +#define fs_w ((loop)->fs_w) +#define idleall ((loop)->idleall) +#define idlecnt ((loop)->idlecnt) +#define idlemax ((loop)->idlemax) +#define idles ((loop)->idles) +#define invoke_cb ((loop)->invoke_cb) +#define io_blocktime ((loop)->io_blocktime) +#define iocp ((loop)->iocp) +#define kqueue_changecnt ((loop)->kqueue_changecnt) +#define kqueue_changemax ((loop)->kqueue_changemax) +#define kqueue_changes ((loop)->kqueue_changes) +#define kqueue_eventmax ((loop)->kqueue_eventmax) +#define kqueue_events ((loop)->kqueue_events) +#define kqueue_fd_pid ((loop)->kqueue_fd_pid) +#define loop_count ((loop)->loop_count) +#define loop_depth ((loop)->loop_depth) +#define loop_done ((loop)->loop_done) +#define mn_now ((loop)->mn_now) +#define now_floor ((loop)->now_floor) +#define origflags ((loop)->origflags) +#define pending_w ((loop)->pending_w) +#define pendingcnt ((loop)->pendingcnt) +#define pendingmax ((loop)->pendingmax) +#define pendingpri ((loop)->pendingpri) +#define pendings ((loop)->pendings) +#define periodiccnt ((loop)->periodiccnt) +#define periodicmax ((loop)->periodicmax) +#define periodics ((loop)->periodics) +#define pipe_w ((loop)->pipe_w) +#define pipe_write_skipped ((loop)->pipe_write_skipped) +#define pipe_write_wanted ((loop)->pipe_write_wanted) +#define pollcnt ((loop)->pollcnt) +#define pollidxmax ((loop)->pollidxmax) +#define pollidxs ((loop)->pollidxs) +#define pollmax ((loop)->pollmax) +#define polls ((loop)->polls) +#define port_eventmax ((loop)->port_eventmax) +#define port_events ((loop)->port_events) +#define postfork ((loop)->postfork) +#define preparecnt ((loop)->preparecnt) +#define preparemax ((loop)->preparemax) +#define prepares ((loop)->prepares) +#define release_cb ((loop)->release_cb) +#define rfeedcnt ((loop)->rfeedcnt) +#define rfeedmax ((loop)->rfeedmax) +#define rfeeds ((loop)->rfeeds) +#define rtmn_diff ((loop)->rtmn_diff) +#define sig_pending ((loop)->sig_pending) +#define sigfd ((loop)->sigfd) +#define sigfd_set ((loop)->sigfd_set) +#define sigfd_w ((loop)->sigfd_w) +#define timeout_blocktime ((loop)->timeout_blocktime) +#define timercnt ((loop)->timercnt) +#define timermax ((loop)->timermax) +#define timers ((loop)->timers) +#define userdata ((loop)->userdata) +#define vec_eo ((loop)->vec_eo) +#define vec_max ((loop)->vec_max) +#define vec_ri ((loop)->vec_ri) +#define vec_ro ((loop)->vec_ro) +#define vec_wi ((loop)->vec_wi) +#define vec_wo ((loop)->vec_wo) +#else +#undef EV_WRAP_H +#undef acquire_cb +#undef activecnt +#undef anfdmax +#undef anfds +#undef async_pending +#undef asynccnt +#undef asyncmax +#undef asyncs +#undef backend +#undef backend_fd +#undef backend_mintime +#undef backend_modify +#undef backend_poll +#undef checkcnt +#undef checkmax +#undef checks +#undef cleanupcnt +#undef cleanupmax +#undef cleanups +#undef curpid +#undef epoll_epermcnt +#undef epoll_epermmax +#undef epoll_eperms +#undef epoll_eventmax +#undef epoll_events +#undef evpipe +#undef fdchangecnt +#undef fdchangemax +#undef fdchanges +#undef forkcnt +#undef forkmax +#undef forks +#undef fs_2625 +#undef fs_fd +#undef fs_hash +#undef fs_w +#undef idleall +#undef idlecnt +#undef idlemax +#undef idles +#undef invoke_cb +#undef io_blocktime +#undef iocp +#undef kqueue_changecnt +#undef kqueue_changemax +#undef kqueue_changes +#undef kqueue_eventmax +#undef kqueue_events +#undef kqueue_fd_pid +#undef loop_count +#undef loop_depth +#undef loop_done +#undef mn_now +#undef now_floor +#undef origflags +#undef pending_w 
+#undef pendingcnt +#undef pendingmax +#undef pendingpri +#undef pendings +#undef periodiccnt +#undef periodicmax +#undef periodics +#undef pipe_w +#undef pipe_write_skipped +#undef pipe_write_wanted +#undef pollcnt +#undef pollidxmax +#undef pollidxs +#undef pollmax +#undef polls +#undef port_eventmax +#undef port_events +#undef postfork +#undef preparecnt +#undef preparemax +#undef prepares +#undef release_cb +#undef rfeedcnt +#undef rfeedmax +#undef rfeeds +#undef rtmn_diff +#undef sig_pending +#undef sigfd +#undef sigfd_set +#undef sigfd_w +#undef timeout_blocktime +#undef timercnt +#undef timermax +#undef timers +#undef userdata +#undef vec_eo +#undef vec_max +#undef vec_ri +#undef vec_ro +#undef vec_wi +#undef vec_wo +#endif diff --git a/examples/udp_proxy/libev/event.c b/examples/udp_proxy/libev/event.c new file mode 100644 index 0000000..5586cd3 --- /dev/null +++ b/examples/udp_proxy/libev/event.c @@ -0,0 +1,425 @@ +/* + * libevent compatibility layer + * + * Copyright (c) 2007,2008,2009,2010,2012 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. 
+ */ + +#include +#include +#include + +#ifdef EV_EVENT_H +# include EV_EVENT_H +#else +# include "event.h" +#endif + +#if EV_MULTIPLICITY +# define dLOOPev struct ev_loop *loop = (struct ev_loop *)ev->ev_base +# define dLOOPbase struct ev_loop *loop = (struct ev_loop *)base +#else +# define dLOOPev +# define dLOOPbase +#endif + +/* never accessed, will always be cast from/to ev_loop */ +struct event_base +{ + int dummy; +}; + +static struct event_base *ev_x_cur; + +static ev_tstamp +ev_tv_get (struct timeval *tv) +{ + if (tv) + { + ev_tstamp after = tv->tv_sec + tv->tv_usec * 1e-6; + return after ? after : 1e-6; + } + else + return -1.; +} + +#define EVENT_STRINGIFY(s) # s +#define EVENT_VERSION(a,b) EVENT_STRINGIFY (a) "." EVENT_STRINGIFY (b) + +const char * +event_get_version (void) +{ + /* returns ABI, not API or library, version */ + return EVENT_VERSION (EV_VERSION_MAJOR, EV_VERSION_MINOR); +} + +const char * +event_get_method (void) +{ + return "libev"; +} + +void *event_init (void) +{ +#if EV_MULTIPLICITY + if (ev_x_cur) + ev_x_cur = (struct event_base *)ev_loop_new (EVFLAG_AUTO); + else + ev_x_cur = (struct event_base *)ev_default_loop (EVFLAG_AUTO); +#else + assert (("libev: multiple event bases not supported when not compiled with EV_MULTIPLICITY", !ev_x_cur)); + + ev_x_cur = (struct event_base *)(long)ev_default_loop (EVFLAG_AUTO); +#endif + + return ev_x_cur; +} + +const char * +event_base_get_method (const struct event_base *base) +{ + return "libev"; +} + +struct event_base * +event_base_new (void) +{ +#if EV_MULTIPLICITY + return (struct event_base *)ev_loop_new (EVFLAG_AUTO); +#else + assert (("libev: multiple event bases not supported when not compiled with EV_MULTIPLICITY")); + return NULL; +#endif +} + +void event_base_free (struct event_base *base) +{ + dLOOPbase; + +#if EV_MULTIPLICITY + if (!ev_is_default_loop (loop)) + ev_loop_destroy (loop); +#endif +} + +int event_dispatch (void) +{ + return event_base_dispatch (ev_x_cur); +} + +#ifdef EV_STANDALONE +void event_set_log_callback (event_log_cb cb) +{ + /* nop */ +} +#endif + +int event_loop (int flags) +{ + return event_base_loop (ev_x_cur, flags); +} + +int event_loopexit (struct timeval *tv) +{ + return event_base_loopexit (ev_x_cur, tv); +} + +event_callback_fn event_get_callback +(const struct event *ev) +{ + return ev->ev_callback; +} + +static void +ev_x_cb (struct event *ev, int revents) +{ + revents &= EV_READ | EV_WRITE | EV_TIMER | EV_SIGNAL; + + ev->ev_res = revents; + ev->ev_callback (ev->ev_fd, (short)revents, ev->ev_arg); +} + +static void +ev_x_cb_sig (EV_P_ struct ev_signal *w, int revents) +{ + struct event *ev = (struct event *)(((char *)w) - offsetof (struct event, iosig.sig)); + + if (revents & EV_ERROR) + event_del (ev); + + ev_x_cb (ev, revents); +} + +static void +ev_x_cb_io (EV_P_ struct ev_io *w, int revents) +{ + struct event *ev = (struct event *)(((char *)w) - offsetof (struct event, iosig.io)); + + if ((revents & EV_ERROR) || !(ev->ev_events & EV_PERSIST)) + event_del (ev); + + ev_x_cb (ev, revents); +} + +static void +ev_x_cb_to (EV_P_ struct ev_timer *w, int revents) +{ + struct event *ev = (struct event *)(((char *)w) - offsetof (struct event, to)); + + event_del (ev); + + ev_x_cb (ev, revents); +} + +void event_set (struct event *ev, int fd, short events, void (*cb)(int, short, void *), void *arg) +{ + if (events & EV_SIGNAL) + ev_init (&ev->iosig.sig, ev_x_cb_sig); + else + ev_init (&ev->iosig.io, ev_x_cb_io); + + ev_init (&ev->to, ev_x_cb_to); + + ev->ev_base = ev_x_cur; /* not 
threadsafe, but it's how libevent works */ + ev->ev_fd = fd; + ev->ev_events = events; + ev->ev_pri = 0; + ev->ev_callback = cb; + ev->ev_arg = arg; + ev->ev_res = 0; + ev->ev_flags = EVLIST_INIT; +} + +int event_once (int fd, short events, void (*cb)(int, short, void *), void *arg, struct timeval *tv) +{ + return event_base_once (ev_x_cur, fd, events, cb, arg, tv); +} + +int event_add (struct event *ev, struct timeval *tv) +{ + dLOOPev; + + if (ev->ev_events & EV_SIGNAL) + { + if (!ev_is_active (&ev->iosig.sig)) + { + ev_signal_set (&ev->iosig.sig, ev->ev_fd); + ev_signal_start (EV_A_ &ev->iosig.sig); + + ev->ev_flags |= EVLIST_SIGNAL; + } + } + else if (ev->ev_events & (EV_READ | EV_WRITE)) + { + if (!ev_is_active (&ev->iosig.io)) + { + ev_io_set (&ev->iosig.io, ev->ev_fd, ev->ev_events & (EV_READ | EV_WRITE)); + ev_io_start (EV_A_ &ev->iosig.io); + + ev->ev_flags |= EVLIST_INSERTED; + } + } + + if (tv) + { + ev->to.repeat = ev_tv_get (tv); + ev_timer_again (EV_A_ &ev->to); + ev->ev_flags |= EVLIST_TIMEOUT; + } + else + { + ev_timer_stop (EV_A_ &ev->to); + ev->ev_flags &= ~EVLIST_TIMEOUT; + } + + ev->ev_flags |= EVLIST_ACTIVE; + + return 0; +} + +int event_del (struct event *ev) +{ + dLOOPev; + + if (ev->ev_events & EV_SIGNAL) + ev_signal_stop (EV_A_ &ev->iosig.sig); + else if (ev->ev_events & (EV_READ | EV_WRITE)) + ev_io_stop (EV_A_ &ev->iosig.io); + + if (ev_is_active (&ev->to)) + ev_timer_stop (EV_A_ &ev->to); + + ev->ev_flags = EVLIST_INIT; + + return 0; +} + +void event_active (struct event *ev, int res, short ncalls) +{ + dLOOPev; + + if (res & EV_TIMEOUT) + ev_feed_event (EV_A_ &ev->to, res & EV_TIMEOUT); + + if (res & EV_SIGNAL) + ev_feed_event (EV_A_ &ev->iosig.sig, res & EV_SIGNAL); + + if (res & (EV_READ | EV_WRITE)) + ev_feed_event (EV_A_ &ev->iosig.io, res & (EV_READ | EV_WRITE)); +} + +int event_pending (struct event *ev, short events, struct timeval *tv) +{ + short revents = 0; + dLOOPev; + + if (ev->ev_events & EV_SIGNAL) + { + /* sig */ + if (ev_is_active (&ev->iosig.sig) || ev_is_pending (&ev->iosig.sig)) + revents |= EV_SIGNAL; + } + else if (ev->ev_events & (EV_READ | EV_WRITE)) + { + /* io */ + if (ev_is_active (&ev->iosig.io) || ev_is_pending (&ev->iosig.io)) + revents |= ev->ev_events & (EV_READ | EV_WRITE); + } + + if (ev->ev_events & EV_TIMEOUT || ev_is_active (&ev->to) || ev_is_pending (&ev->to)) + { + revents |= EV_TIMEOUT; + + if (tv) + { + ev_tstamp at = ev_now (EV_A); + + tv->tv_sec = (long)at; + tv->tv_usec = (long)((at - (ev_tstamp)tv->tv_sec) * 1e6); + } + } + + return events & revents; +} + +int event_priority_init (int npri) +{ + return event_base_priority_init (ev_x_cur, npri); +} + +int event_priority_set (struct event *ev, int pri) +{ + ev->ev_pri = pri; + + return 0; +} + +int event_base_set (struct event_base *base, struct event *ev) +{ + ev->ev_base = base; + + return 0; +} + +int event_base_loop (struct event_base *base, int flags) +{ + dLOOPbase; + + return !ev_run (EV_A_ flags); +} + +int event_base_dispatch (struct event_base *base) +{ + return event_base_loop (base, 0); +} + +static void +ev_x_loopexit_cb (int revents, void *base) +{ + dLOOPbase; + + ev_break (EV_A_ EVBREAK_ONE); +} + +int event_base_loopexit (struct event_base *base, struct timeval *tv) +{ + ev_tstamp after = ev_tv_get (tv); + dLOOPbase; + + ev_once (EV_A_ -1, 0, after >= 0. ? 
after : 0., ev_x_loopexit_cb, (void *)base); + + return 0; +} + +struct ev_x_once +{ + int fd; + void (*cb)(int, short, void *); + void *arg; +}; + +static void +ev_x_once_cb (int revents, void *arg) +{ + struct ev_x_once *once = (struct ev_x_once *)arg; + + once->cb (once->fd, (short)revents, once->arg); + free (once); +} + +int event_base_once (struct event_base *base, int fd, short events, void (*cb)(int, short, void *), void *arg, struct timeval *tv) +{ + struct ev_x_once *once = (struct ev_x_once *)malloc (sizeof (struct ev_x_once)); + dLOOPbase; + + if (!once) + return -1; + + once->fd = fd; + once->cb = cb; + once->arg = arg; + + ev_once (EV_A_ fd, events & (EV_READ | EV_WRITE), ev_tv_get (tv), ev_x_once_cb, (void *)once); + + return 0; +} + +int event_base_priority_init (struct event_base *base, int npri) +{ + /*dLOOPbase;*/ + + return 0; +} + diff --git a/examples/udp_proxy/libev/event.h b/examples/udp_proxy/libev/event.h new file mode 100644 index 0000000..aa81928 --- /dev/null +++ b/examples/udp_proxy/libev/event.h @@ -0,0 +1,177 @@ +/* + * libevent compatibility header, only core events supported + * + * Copyright (c) 2007,2008,2010,2012 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. 
+ */ + +#ifndef EVENT_H_ +#define EVENT_H_ + +#ifdef EV_H +# include EV_H +#else +# include "ev.h" +#endif + +#ifndef EVLOOP_NONBLOCK +# define EVLOOP_NONBLOCK EVRUN_NOWAIT +#endif +#ifndef EVLOOP_ONESHOT +# define EVLOOP_ONESHOT EVRUN_ONCE +#endif +#ifndef EV_TIMEOUT +# define EV_TIMEOUT EV_TIMER +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* we need sys/time.h for struct timeval only */ +#if !defined (WIN32) || defined (__MINGW32__) +# include /* mingw seems to need this, for whatever reason */ +# include +#endif + +struct event_base; + +#define EVLIST_TIMEOUT 0x01 +#define EVLIST_INSERTED 0x02 +#define EVLIST_SIGNAL 0x04 +#define EVLIST_ACTIVE 0x08 +#define EVLIST_INTERNAL 0x10 +#define EVLIST_INIT 0x80 + +typedef void (*event_callback_fn)(int, short, void *); + +struct event +{ + /* libev watchers we map onto */ + union { + struct ev_io io; + struct ev_signal sig; + } iosig; + struct ev_timer to; + + /* compatibility slots */ + struct event_base *ev_base; + event_callback_fn ev_callback; + void *ev_arg; + int ev_fd; + int ev_pri; + int ev_res; + int ev_flags; + short ev_events; +}; + +event_callback_fn event_get_callback (const struct event *ev); + +#define EV_READ EV_READ +#define EV_WRITE EV_WRITE +#define EV_PERSIST 0x10 +#define EV_ET 0x20 /* nop */ + +#define EVENT_SIGNAL(ev) ((int) (ev)->ev_fd) +#define EVENT_FD(ev) ((int) (ev)->ev_fd) + +#define event_initialized(ev) ((ev)->ev_flags & EVLIST_INIT) + +#define evtimer_add(ev,tv) event_add (ev, tv) +#define evtimer_set(ev,cb,data) event_set (ev, -1, 0, cb, data) +#define evtimer_del(ev) event_del (ev) +#define evtimer_pending(ev,tv) event_pending (ev, EV_TIMEOUT, tv) +#define evtimer_initialized(ev) event_initialized (ev) + +#define timeout_add(ev,tv) evtimer_add (ev, tv) +#define timeout_set(ev,cb,data) evtimer_set (ev, cb, data) +#define timeout_del(ev) evtimer_del (ev) +#define timeout_pending(ev,tv) evtimer_pending (ev, tv) +#define timeout_initialized(ev) evtimer_initialized (ev) + +#define signal_add(ev,tv) event_add (ev, tv) +#define signal_set(ev,sig,cb,data) event_set (ev, sig, EV_SIGNAL | EV_PERSIST, cb, data) +#define signal_del(ev) event_del (ev) +#define signal_pending(ev,tv) event_pending (ev, EV_SIGNAL, tv) +#define signal_initialized(ev) event_initialized (ev) + +const char *event_get_version (void); +const char *event_get_method (void); + +void *event_init (void); +void event_base_free (struct event_base *base); + +#define EVLOOP_ONCE EVLOOP_ONESHOT +int event_loop (int); +int event_loopexit (struct timeval *tv); +int event_dispatch (void); + +#define _EVENT_LOG_DEBUG 0 +#define _EVENT_LOG_MSG 1 +#define _EVENT_LOG_WARN 2 +#define _EVENT_LOG_ERR 3 +typedef void (*event_log_cb)(int severity, const char *msg); +void event_set_log_callback(event_log_cb cb); + +void event_set (struct event *ev, int fd, short events, void (*cb)(int, short, void *), void *arg); +int event_once (int fd, short events, void (*cb)(int, short, void *), void *arg, struct timeval *tv); + +int event_add (struct event *ev, struct timeval *tv); +int event_del (struct event *ev); +void event_active (struct event *ev, int res, short ncalls); /* ncalls is being ignored */ + +int event_pending (struct event *ev, short, struct timeval *tv); + +int event_priority_init (int npri); +int event_priority_set (struct event *ev, int pri); + +struct event_base *event_base_new (void); +const char *event_base_get_method (const struct event_base *); +int event_base_set (struct event_base *base, struct event *ev); +int event_base_loop (struct event_base 
*base, int); +int event_base_loopexit (struct event_base *base, struct timeval *tv); +int event_base_dispatch (struct event_base *base); +int event_base_once (struct event_base *base, int fd, short events, void (*cb)(int, short, void *), void *arg, struct timeval *tv); +int event_base_priority_init (struct event_base *base, int fd); + +/* next line is different in the libevent+libev version */ +/*libevent-include*/ + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/examples/udp_proxy/libev/include/ev.h b/examples/udp_proxy/libev/include/ev.h new file mode 100644 index 0000000..db93777 --- /dev/null +++ b/examples/udp_proxy/libev/include/ev.h @@ -0,0 +1,854 @@ +/* + * libev native API header + * + * Copyright (c) 2007,2008,2009,2010,2011,2012,2015 Marc Alexander Lehmann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. 
+ */ + +#ifndef EV_H_ +#define EV_H_ + +#ifdef __cplusplus +# define EV_CPP(x) x +# if __cplusplus >= 201103L +# define EV_THROW noexcept +# else +# define EV_THROW throw () +# endif +#else +# define EV_CPP(x) +# define EV_THROW +#endif + +EV_CPP(extern "C" {) + +/*****************************************************************************/ + +/* pre-4.0 compatibility */ +#ifndef EV_COMPAT3 +# define EV_COMPAT3 1 +#endif + +#ifndef EV_FEATURES +# if defined __OPTIMIZE_SIZE__ +# define EV_FEATURES 0x7c +# else +# define EV_FEATURES 0x7f +# endif +#endif + +#define EV_FEATURE_CODE ((EV_FEATURES) & 1) +#define EV_FEATURE_DATA ((EV_FEATURES) & 2) +#define EV_FEATURE_CONFIG ((EV_FEATURES) & 4) +#define EV_FEATURE_API ((EV_FEATURES) & 8) +#define EV_FEATURE_WATCHERS ((EV_FEATURES) & 16) +#define EV_FEATURE_BACKENDS ((EV_FEATURES) & 32) +#define EV_FEATURE_OS ((EV_FEATURES) & 64) + +/* these priorities are inclusive, higher priorities will be invoked earlier */ +#ifndef EV_MINPRI +# define EV_MINPRI (EV_FEATURE_CONFIG ? -2 : 0) +#endif +#ifndef EV_MAXPRI +# define EV_MAXPRI (EV_FEATURE_CONFIG ? +2 : 0) +#endif + +#ifndef EV_MULTIPLICITY +# define EV_MULTIPLICITY EV_FEATURE_CONFIG +#endif + +#ifndef EV_PERIODIC_ENABLE +# define EV_PERIODIC_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_STAT_ENABLE +# define EV_STAT_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_PREPARE_ENABLE +# define EV_PREPARE_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_CHECK_ENABLE +# define EV_CHECK_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_IDLE_ENABLE +# define EV_IDLE_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_FORK_ENABLE +# define EV_FORK_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_CLEANUP_ENABLE +# define EV_CLEANUP_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_SIGNAL_ENABLE +# define EV_SIGNAL_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_CHILD_ENABLE +# ifdef _WIN32 +# define EV_CHILD_ENABLE 0 +# else +# define EV_CHILD_ENABLE EV_FEATURE_WATCHERS +#endif +#endif + +#ifndef EV_ASYNC_ENABLE +# define EV_ASYNC_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_EMBED_ENABLE +# define EV_EMBED_ENABLE EV_FEATURE_WATCHERS +#endif + +#ifndef EV_WALK_ENABLE +# define EV_WALK_ENABLE 0 /* not yet */ +#endif + +/*****************************************************************************/ + +#if EV_CHILD_ENABLE && !EV_SIGNAL_ENABLE +# undef EV_SIGNAL_ENABLE +# define EV_SIGNAL_ENABLE 1 +#endif + +/*****************************************************************************/ + +typedef double ev_tstamp; + +#include /* for memmove */ + +#ifndef EV_ATOMIC_T +# include +# define EV_ATOMIC_T sig_atomic_t volatile +#endif + +#if EV_STAT_ENABLE +# ifdef _WIN32 +# include +# include +# endif +# include +#endif + +/* support multiple event loops? 
*/ +#if EV_MULTIPLICITY +struct ev_loop; +# define EV_P struct ev_loop *loop /* a loop as sole parameter in a declaration */ +# define EV_P_ EV_P, /* a loop as first of multiple parameters */ +# define EV_A loop /* a loop as sole argument to a function call */ +# define EV_A_ EV_A, /* a loop as first of multiple arguments */ +# define EV_DEFAULT_UC ev_default_loop_uc_ () /* the default loop, if initialised, as sole arg */ +# define EV_DEFAULT_UC_ EV_DEFAULT_UC, /* the default loop as first of multiple arguments */ +# define EV_DEFAULT ev_default_loop (0) /* the default loop as sole arg */ +# define EV_DEFAULT_ EV_DEFAULT, /* the default loop as first of multiple arguments */ +#else +# define EV_P void +# define EV_P_ +# define EV_A +# define EV_A_ +# define EV_DEFAULT +# define EV_DEFAULT_ +# define EV_DEFAULT_UC +# define EV_DEFAULT_UC_ +# undef EV_EMBED_ENABLE +#endif + +/* EV_INLINE is used for functions in header files */ +#if __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +# define EV_INLINE static inline +#else +# define EV_INLINE static +#endif + +#ifdef EV_API_STATIC +# define EV_API_DECL static +#else +# define EV_API_DECL extern +#endif + +/* EV_PROTOTYPES can be used to switch of prototype declarations */ +#ifndef EV_PROTOTYPES +# define EV_PROTOTYPES 1 +#endif + +/*****************************************************************************/ + +#define EV_VERSION_MAJOR 4 +#define EV_VERSION_MINOR 24 + +/* eventmask, revents, events... */ +enum { + EV_UNDEF = (int)0xFFFFFFFF, /* guaranteed to be invalid */ + EV_NONE = 0x00, /* no events */ + EV_READ = 0x01, /* ev_io detected read will not block */ + EV_WRITE = 0x02, /* ev_io detected write will not block */ + EV__IOFDSET = 0x80, /* internal use only */ + EV_IO = EV_READ, /* alias for type-detection */ + EV_TIMER = 0x00000100, /* timer timed out */ +#if EV_COMPAT3 + EV_TIMEOUT = EV_TIMER, /* pre 4.0 API compatibility */ +#endif + EV_PERIODIC = 0x00000200, /* periodic timer timed out */ + EV_SIGNAL = 0x00000400, /* signal was received */ + EV_CHILD = 0x00000800, /* child/pid had status change */ + EV_STAT = 0x00001000, /* stat data changed */ + EV_IDLE = 0x00002000, /* event loop is idling */ + EV_PREPARE = 0x00004000, /* event loop about to poll */ + EV_CHECK = 0x00008000, /* event loop finished poll */ + EV_EMBED = 0x00010000, /* embedded event loop needs sweep */ + EV_FORK = 0x00020000, /* event loop resumed in child */ + EV_CLEANUP = 0x00040000, /* event loop resumed in child */ + EV_ASYNC = 0x00080000, /* async intra-loop signal */ + EV_CUSTOM = 0x01000000, /* for use by user code */ + EV_ERROR = (int)0x80000000 /* sent when an error occurs */ +}; + +/* can be used to add custom fields to all watchers, while losing binary compatibility */ +#ifndef EV_COMMON +# define EV_COMMON void *data; +#endif + +#ifndef EV_CB_DECLARE +# define EV_CB_DECLARE(type) void (*cb)(EV_P_ struct type *w, int revents); +#endif +#ifndef EV_CB_INVOKE +# define EV_CB_INVOKE(watcher,revents) (watcher)->cb (EV_A_ (watcher), (revents)) +#endif + +/* not official, do not use */ +#define EV_CB(type,name) void name (EV_P_ struct ev_ ## type *w, int revents) + +/* + * struct member types: + * private: you may look at them, but not change them, + * and they might not mean anything to you. + * ro: can be read anytime, but only changed when the watcher isn't active. + * rw: can be read and modified anytime, even when the watcher is active. 
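+ *
+ *     (illustration, not part of the upstream comment: an ev_timer's
+ *     "repeat" member is rw and may be changed while the timer runs,
+ *     whereas an ev_io's "fd" member is ro and must only be changed
+ *     via ev_io_set while the watcher is stopped)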
+ * + * some internal details that might be helpful for debugging: + * + * active is either 0, which means the watcher is not active, + * or the array index of the watcher (periodics, timers) + * or the array index + 1 (most other watchers) + * or simply 1 for watchers that aren't in some array. + * pending is either 0, in which case the watcher isn't, + * or the array index + 1 in the pendings array. + */ + +#if EV_MINPRI == EV_MAXPRI +# define EV_DECL_PRIORITY +#elif !defined (EV_DECL_PRIORITY) +# define EV_DECL_PRIORITY int priority; +#endif + +/* shared by all watchers */ +#define EV_WATCHER(type) \ + int active; /* private */ \ + int pending; /* private */ \ + EV_DECL_PRIORITY /* private */ \ + EV_COMMON /* rw */ \ + EV_CB_DECLARE (type) /* private */ + +#define EV_WATCHER_LIST(type) \ + EV_WATCHER (type) \ + struct ev_watcher_list *next; /* private */ + +#define EV_WATCHER_TIME(type) \ + EV_WATCHER (type) \ + ev_tstamp at; /* private */ + +/* base class, nothing to see here unless you subclass */ +typedef struct ev_watcher +{ + EV_WATCHER (ev_watcher) +} ev_watcher; + +/* base class, nothing to see here unless you subclass */ +typedef struct ev_watcher_list +{ + EV_WATCHER_LIST (ev_watcher_list) +} ev_watcher_list; + +/* base class, nothing to see here unless you subclass */ +typedef struct ev_watcher_time +{ + EV_WATCHER_TIME (ev_watcher_time) +} ev_watcher_time; + +/* invoked when fd is either EV_READable or EV_WRITEable */ +/* revent EV_READ, EV_WRITE */ +typedef struct ev_io +{ + EV_WATCHER_LIST (ev_io) + + int fd; /* ro */ + int events; /* ro */ +} ev_io; + +/* invoked after a specific time, repeatable (based on monotonic clock) */ +/* revent EV_TIMEOUT */ +typedef struct ev_timer +{ + EV_WATCHER_TIME (ev_timer) + + ev_tstamp repeat; /* rw */ +} ev_timer; + +/* invoked at some specific time, possibly repeating at regular intervals (based on UTC) */ +/* revent EV_PERIODIC */ +typedef struct ev_periodic +{ + EV_WATCHER_TIME (ev_periodic) + + ev_tstamp offset; /* rw */ + ev_tstamp interval; /* rw */ + ev_tstamp (*reschedule_cb)(struct ev_periodic *w, ev_tstamp now) EV_THROW; /* rw */ +} ev_periodic; + +/* invoked when the given signal has been received */ +/* revent EV_SIGNAL */ +typedef struct ev_signal +{ + EV_WATCHER_LIST (ev_signal) + + int signum; /* ro */ +} ev_signal; + +/* invoked when sigchld is received and waitpid indicates the given pid */ +/* revent EV_CHILD */ +/* does not support priorities */ +typedef struct ev_child +{ + EV_WATCHER_LIST (ev_child) + + int flags; /* private */ + int pid; /* ro */ + int rpid; /* rw, holds the received pid */ + int rstatus; /* rw, holds the exit status, use the macros from sys/wait.h */ +} ev_child; + +#if EV_STAT_ENABLE +/* st_nlink = 0 means missing file or other error */ +# ifdef _WIN32 +typedef struct _stati64 ev_statdata; +# else +typedef struct stat ev_statdata; +# endif + +/* invoked each time the stat data changes for a given path */ +/* revent EV_STAT */ +typedef struct ev_stat +{ + EV_WATCHER_LIST (ev_stat) + + ev_timer timer; /* private */ + ev_tstamp interval; /* ro */ + const char *path; /* ro */ + ev_statdata prev; /* ro */ + ev_statdata attr; /* ro */ + + int wd; /* wd for inotify, fd for kqueue */ +} ev_stat; +#endif + +#if EV_IDLE_ENABLE +/* invoked when the nothing else needs to be done, keeps the process from blocking */ +/* revent EV_IDLE */ +typedef struct ev_idle +{ + EV_WATCHER (ev_idle) +} ev_idle; +#endif + +/* invoked for each run of the mainloop, just before the blocking call */ +/* you can still change 
events in any way you like */ +/* revent EV_PREPARE */ +typedef struct ev_prepare +{ + EV_WATCHER (ev_prepare) +} ev_prepare; + +/* invoked for each run of the mainloop, just after the blocking call */ +/* revent EV_CHECK */ +typedef struct ev_check +{ + EV_WATCHER (ev_check) +} ev_check; + +#if EV_FORK_ENABLE +/* the callback gets invoked before check in the child process when a fork was detected */ +/* revent EV_FORK */ +typedef struct ev_fork +{ + EV_WATCHER (ev_fork) +} ev_fork; +#endif + +#if EV_CLEANUP_ENABLE +/* is invoked just before the loop gets destroyed */ +/* revent EV_CLEANUP */ +typedef struct ev_cleanup +{ + EV_WATCHER (ev_cleanup) +} ev_cleanup; +#endif + +#if EV_EMBED_ENABLE +/* used to embed an event loop inside another */ +/* the callback gets invoked when the event loop has handled events, and can be 0 */ +typedef struct ev_embed +{ + EV_WATCHER (ev_embed) + + struct ev_loop *other; /* ro */ + ev_io io; /* private */ + ev_prepare prepare; /* private */ + ev_check check; /* unused */ + ev_timer timer; /* unused */ + ev_periodic periodic; /* unused */ + ev_idle idle; /* unused */ + ev_fork fork; /* private */ +#if EV_CLEANUP_ENABLE + ev_cleanup cleanup; /* unused */ +#endif +} ev_embed; +#endif + +#if EV_ASYNC_ENABLE +/* invoked when somebody calls ev_async_send on the watcher */ +/* revent EV_ASYNC */ +typedef struct ev_async +{ + EV_WATCHER (ev_async) + + EV_ATOMIC_T sent; /* private */ +} ev_async; + +# define ev_async_pending(w) (+(w)->sent) +#endif + +/* the presence of this union forces similar struct layout */ +union ev_any_watcher +{ + struct ev_watcher w; + struct ev_watcher_list wl; + + struct ev_io io; + struct ev_timer timer; + struct ev_periodic periodic; + struct ev_signal signal; + struct ev_child child; +#if EV_STAT_ENABLE + struct ev_stat stat; +#endif +#if EV_IDLE_ENABLE + struct ev_idle idle; +#endif + struct ev_prepare prepare; + struct ev_check check; +#if EV_FORK_ENABLE + struct ev_fork fork; +#endif +#if EV_CLEANUP_ENABLE + struct ev_cleanup cleanup; +#endif +#if EV_EMBED_ENABLE + struct ev_embed embed; +#endif +#if EV_ASYNC_ENABLE + struct ev_async async; +#endif +}; + +/* flag bits for ev_default_loop and ev_loop_new */ +enum { + /* the default */ + EVFLAG_AUTO = 0x00000000U, /* not quite a mask */ + /* flag bits */ + EVFLAG_NOENV = 0x01000000U, /* do NOT consult environment */ + EVFLAG_FORKCHECK = 0x02000000U, /* check for a fork in each iteration */ + /* debugging/feature disable */ + EVFLAG_NOINOTIFY = 0x00100000U, /* do not attempt to use inotify */ +#if EV_COMPAT3 + EVFLAG_NOSIGFD = 0, /* compatibility to pre-3.9 */ +#endif + EVFLAG_SIGNALFD = 0x00200000U, /* attempt to use signalfd */ + EVFLAG_NOSIGMASK = 0x00400000U /* avoid modifying the signal mask */ +}; + +/* method bits to be ored together */ +enum { + EVBACKEND_SELECT = 0x00000001U, /* available just about anywhere */ + EVBACKEND_POLL = 0x00000002U, /* !win, !aix, broken on osx */ + EVBACKEND_EPOLL = 0x00000004U, /* linux */ + EVBACKEND_KQUEUE = 0x00000008U, /* bsd, broken on osx */ + EVBACKEND_DEVPOLL = 0x00000010U, /* solaris 8 */ /* NYI */ + EVBACKEND_PORT = 0x00000020U, /* solaris 10 */ + EVBACKEND_ALL = 0x0000003FU, /* all known backends */ + EVBACKEND_MASK = 0x0000FFFFU /* all future backends */ +}; + +#if EV_PROTOTYPES +EV_API_DECL int ev_version_major (void) EV_THROW; +EV_API_DECL int ev_version_minor (void) EV_THROW; + +EV_API_DECL unsigned int ev_supported_backends (void) EV_THROW; +EV_API_DECL unsigned int ev_recommended_backends (void) EV_THROW; +EV_API_DECL unsigned int 
ev_embeddable_backends (void) EV_THROW; + +EV_API_DECL ev_tstamp ev_time (void) EV_THROW; +EV_API_DECL void ev_sleep (ev_tstamp delay) EV_THROW; /* sleep for a while */ + +/* Sets the allocation function to use, works like realloc. + * It is used to allocate and free memory. + * If it returns zero when memory needs to be allocated, the library might abort + * or take some potentially destructive action. + * The default is your system realloc function. + */ +EV_API_DECL void ev_set_allocator (void *(*cb)(void *ptr, long size) EV_THROW) EV_THROW; + +/* set the callback function to call on a + * retryable syscall error + * (such as failed select, poll, epoll_wait) + */ +EV_API_DECL void ev_set_syserr_cb (void (*cb)(const char *msg) EV_THROW) EV_THROW; + +#if EV_MULTIPLICITY + +/* the default loop is the only one that handles signals and child watchers */ +/* you can call this as often as you like */ +EV_API_DECL struct ev_loop *ev_default_loop (unsigned int flags EV_CPP (= 0)) EV_THROW; + +#ifdef EV_API_STATIC +EV_API_DECL struct ev_loop *ev_default_loop_ptr; +#endif + +EV_INLINE struct ev_loop * +ev_default_loop_uc_ (void) EV_THROW +{ + extern struct ev_loop *ev_default_loop_ptr; + + return ev_default_loop_ptr; +} + +EV_INLINE int +ev_is_default_loop (EV_P) EV_THROW +{ + return EV_A == EV_DEFAULT_UC; +} + +/* create and destroy alternative loops that don't handle signals */ +EV_API_DECL struct ev_loop *ev_loop_new (unsigned int flags EV_CPP (= 0)) EV_THROW; + +EV_API_DECL ev_tstamp ev_now (EV_P) EV_THROW; /* time w.r.t. timers and the eventloop, updated after each poll */ + +#else + +EV_API_DECL int ev_default_loop (unsigned int flags EV_CPP (= 0)) EV_THROW; /* returns true when successful */ + +EV_API_DECL ev_tstamp ev_rt_now; + +EV_INLINE ev_tstamp +ev_now (void) EV_THROW +{ + return ev_rt_now; +} + +/* looks weird, but ev_is_default_loop (EV_A) still works if this exists */ +EV_INLINE int +ev_is_default_loop (void) EV_THROW +{ + return 1; +} + +#endif /* multiplicity */ + +/* destroy event loops, also works for the default loop */ +EV_API_DECL void ev_loop_destroy (EV_P); + +/* this needs to be called after fork, to duplicate the loop */ +/* when you want to re-use it in the child */ +/* you can call it in either the parent or the child */ +/* you can actually call it at any time, anywhere :) */ +EV_API_DECL void ev_loop_fork (EV_P) EV_THROW; + +EV_API_DECL unsigned int ev_backend (EV_P) EV_THROW; /* backend in use by loop */ + +EV_API_DECL void ev_now_update (EV_P) EV_THROW; /* update event loop time */ + +#if EV_WALK_ENABLE +/* walk (almost) all watchers in the loop of a given type, invoking the */ +/* callback on every such watcher. The callback might stop the watcher, */ +/* but do nothing else with the loop */ +EV_API_DECL void ev_walk (EV_P_ int types, void (*cb)(EV_P_ int type, void *w)) EV_THROW; +#endif + +#endif /* prototypes */ + +/* ev_run flags values */ +enum { + EVRUN_NOWAIT = 1, /* do not block/wait */ + EVRUN_ONCE = 2 /* block *once* only */ +}; + +/* ev_break how values */ +enum { + EVBREAK_CANCEL = 0, /* undo unloop */ + EVBREAK_ONE = 1, /* unloop once */ + EVBREAK_ALL = 2 /* unloop all loops */ +}; + +#if EV_PROTOTYPES +EV_API_DECL int ev_run (EV_P_ int flags EV_CPP (= 0)); +EV_API_DECL void ev_break (EV_P_ int how EV_CPP (= EVBREAK_ONE)) EV_THROW; /* break out of the loop */ + +/* + * ref/unref can be used to add or remove a refcount on the mainloop. every watcher + * keeps one reference. 
if you have a long-running watcher you never unregister that + * should not keep ev_loop from running, unref() after starting, and ref() before stopping. + */ +EV_API_DECL void ev_ref (EV_P) EV_THROW; +EV_API_DECL void ev_unref (EV_P) EV_THROW; + +/* + * convenience function, wait for a single event, without registering an event watcher + * if timeout is < 0, do wait indefinitely + */ +EV_API_DECL void ev_once (EV_P_ int fd, int events, ev_tstamp timeout, void (*cb)(int revents, void *arg), void *arg) EV_THROW; + +# if EV_FEATURE_API +EV_API_DECL unsigned int ev_iteration (EV_P) EV_THROW; /* number of loop iterations */ +EV_API_DECL unsigned int ev_depth (EV_P) EV_THROW; /* #ev_loop enters - #ev_loop leaves */ +EV_API_DECL void ev_verify (EV_P) EV_THROW; /* abort if loop data corrupted */ + +EV_API_DECL void ev_set_io_collect_interval (EV_P_ ev_tstamp interval) EV_THROW; /* sleep at least this time, default 0 */ +EV_API_DECL void ev_set_timeout_collect_interval (EV_P_ ev_tstamp interval) EV_THROW; /* sleep at least this time, default 0 */ + +/* advanced stuff for threading etc. support, see docs */ +EV_API_DECL void ev_set_userdata (EV_P_ void *data) EV_THROW; +EV_API_DECL void *ev_userdata (EV_P) EV_THROW; +typedef void (*ev_loop_callback)(EV_P); +EV_API_DECL void ev_set_invoke_pending_cb (EV_P_ ev_loop_callback invoke_pending_cb) EV_THROW; +/* C++ doesn't allow the use of the ev_loop_callback typedef here, so we need to spell it out */ +EV_API_DECL void ev_set_loop_release_cb (EV_P_ void (*release)(EV_P) EV_THROW, void (*acquire)(EV_P) EV_THROW) EV_THROW; + +EV_API_DECL unsigned int ev_pending_count (EV_P) EV_THROW; /* number of pending events, if any */ +EV_API_DECL void ev_invoke_pending (EV_P); /* invoke all pending watchers */ + +/* + * stop/start the timer handling. 
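+ * (illustration, not part of the upstream comment: ev_suspend freezes the
+ * loop's notion of time and ev_resume restores it, adjusting pending
+ * timers so the suspended interval does not make them all fire at once)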
+ */ +EV_API_DECL void ev_suspend (EV_P) EV_THROW; +EV_API_DECL void ev_resume (EV_P) EV_THROW; +#endif + +#endif + +/* these may evaluate ev multiple times, and the other arguments at most once */ +/* either use ev_init + ev_TYPE_set, or the ev_TYPE_init macro, below, to first initialise a watcher */ +#define ev_init(ev,cb_) do { \ + ((ev_watcher *)(void *)(ev))->active = \ + ((ev_watcher *)(void *)(ev))->pending = 0; \ + ev_set_priority ((ev), 0); \ + ev_set_cb ((ev), cb_); \ +} while (0) + +#define ev_io_set(ev,fd_,events_) do { (ev)->fd = (fd_); (ev)->events = (events_) | EV__IOFDSET; } while (0) +#define ev_timer_set(ev,after_,repeat_) do { ((ev_watcher_time *)(ev))->at = (after_); (ev)->repeat = (repeat_); } while (0) +#define ev_periodic_set(ev,ofs_,ival_,rcb_) do { (ev)->offset = (ofs_); (ev)->interval = (ival_); (ev)->reschedule_cb = (rcb_); } while (0) +#define ev_signal_set(ev,signum_) do { (ev)->signum = (signum_); } while (0) +#define ev_child_set(ev,pid_,trace_) do { (ev)->pid = (pid_); (ev)->flags = !!(trace_); } while (0) +#define ev_stat_set(ev,path_,interval_) do { (ev)->path = (path_); (ev)->interval = (interval_); (ev)->wd = -2; } while (0) +#define ev_idle_set(ev) /* nop, yes, this is a serious in-joke */ +#define ev_prepare_set(ev) /* nop, yes, this is a serious in-joke */ +#define ev_check_set(ev) /* nop, yes, this is a serious in-joke */ +#define ev_embed_set(ev,other_) do { (ev)->other = (other_); } while (0) +#define ev_fork_set(ev) /* nop, yes, this is a serious in-joke */ +#define ev_cleanup_set(ev) /* nop, yes, this is a serious in-joke */ +#define ev_async_set(ev) /* nop, yes, this is a serious in-joke */ + +#define ev_io_init(ev,cb,fd,events) do { ev_init ((ev), (cb)); ev_io_set ((ev),(fd),(events)); } while (0) +#define ev_timer_init(ev,cb,after,repeat) do { ev_init ((ev), (cb)); ev_timer_set ((ev),(after),(repeat)); } while (0) +#define ev_periodic_init(ev,cb,ofs,ival,rcb) do { ev_init ((ev), (cb)); ev_periodic_set ((ev),(ofs),(ival),(rcb)); } while (0) +#define ev_signal_init(ev,cb,signum) do { ev_init ((ev), (cb)); ev_signal_set ((ev), (signum)); } while (0) +#define ev_child_init(ev,cb,pid,trace) do { ev_init ((ev), (cb)); ev_child_set ((ev),(pid),(trace)); } while (0) +#define ev_stat_init(ev,cb,path,interval) do { ev_init ((ev), (cb)); ev_stat_set ((ev),(path),(interval)); } while (0) +#define ev_idle_init(ev,cb) do { ev_init ((ev), (cb)); ev_idle_set ((ev)); } while (0) +#define ev_prepare_init(ev,cb) do { ev_init ((ev), (cb)); ev_prepare_set ((ev)); } while (0) +#define ev_check_init(ev,cb) do { ev_init ((ev), (cb)); ev_check_set ((ev)); } while (0) +#define ev_embed_init(ev,cb,other) do { ev_init ((ev), (cb)); ev_embed_set ((ev),(other)); } while (0) +#define ev_fork_init(ev,cb) do { ev_init ((ev), (cb)); ev_fork_set ((ev)); } while (0) +#define ev_cleanup_init(ev,cb) do { ev_init ((ev), (cb)); ev_cleanup_set ((ev)); } while (0) +#define ev_async_init(ev,cb) do { ev_init ((ev), (cb)); ev_async_set ((ev)); } while (0) + +#define ev_is_pending(ev) (0 + ((ev_watcher *)(void *)(ev))->pending) /* ro, true when watcher is waiting for callback invocation */ +#define ev_is_active(ev) (0 + ((ev_watcher *)(void *)(ev))->active) /* ro, true when the watcher has been started */ + +#define ev_cb_(ev) (ev)->cb /* rw */ +#define ev_cb(ev) (memmove (&ev_cb_ (ev), &((ev_watcher *)(ev))->cb, sizeof (ev_cb_ (ev))), (ev)->cb) + +#if EV_MINPRI == EV_MAXPRI +# define ev_priority(ev) ((ev), EV_MINPRI) +# define ev_set_priority(ev,pri) ((ev), (pri)) +#else +# define 
ev_priority(ev) (+(((ev_watcher *)(void *)(ev))->priority)) +# define ev_set_priority(ev,pri) ( (ev_watcher *)(void *)(ev))->priority = (pri) +#endif + +#define ev_periodic_at(ev) (+((ev_watcher_time *)(ev))->at) + +#ifndef ev_set_cb +# define ev_set_cb(ev,cb_) (ev_cb_ (ev) = (cb_), memmove (&((ev_watcher *)(ev))->cb, &ev_cb_ (ev), sizeof (ev_cb_ (ev)))) +#endif + +/* stopping (enabling, adding) a watcher does nothing if it is already running */ +/* stopping (disabling, deleting) a watcher does nothing unless it's already running */ +#if EV_PROTOTYPES + +/* feeds an event into a watcher as if the event actually occurred */ +/* accepts any ev_watcher type */ +EV_API_DECL void ev_feed_event (EV_P_ void *w, int revents) EV_THROW; +EV_API_DECL void ev_feed_fd_event (EV_P_ int fd, int revents) EV_THROW; +#if EV_SIGNAL_ENABLE +EV_API_DECL void ev_feed_signal (int signum) EV_THROW; +EV_API_DECL void ev_feed_signal_event (EV_P_ int signum) EV_THROW; +#endif +EV_API_DECL void ev_invoke (EV_P_ void *w, int revents); +EV_API_DECL int ev_clear_pending (EV_P_ void *w) EV_THROW; + +EV_API_DECL void ev_io_start (EV_P_ ev_io *w) EV_THROW; +EV_API_DECL void ev_io_stop (EV_P_ ev_io *w) EV_THROW; + +EV_API_DECL void ev_timer_start (EV_P_ ev_timer *w) EV_THROW; +EV_API_DECL void ev_timer_stop (EV_P_ ev_timer *w) EV_THROW; +/* stops if active and no repeat, restarts if active and repeating, starts if inactive and repeating */ +EV_API_DECL void ev_timer_again (EV_P_ ev_timer *w) EV_THROW; +/* return remaining time */ +EV_API_DECL ev_tstamp ev_timer_remaining (EV_P_ ev_timer *w) EV_THROW; + +#if EV_PERIODIC_ENABLE +EV_API_DECL void ev_periodic_start (EV_P_ ev_periodic *w) EV_THROW; +EV_API_DECL void ev_periodic_stop (EV_P_ ev_periodic *w) EV_THROW; +EV_API_DECL void ev_periodic_again (EV_P_ ev_periodic *w) EV_THROW; +#endif + +/* only supported in the default loop */ +#if EV_SIGNAL_ENABLE +EV_API_DECL void ev_signal_start (EV_P_ ev_signal *w) EV_THROW; +EV_API_DECL void ev_signal_stop (EV_P_ ev_signal *w) EV_THROW; +#endif + +/* only supported in the default loop */ +# if EV_CHILD_ENABLE +EV_API_DECL void ev_child_start (EV_P_ ev_child *w) EV_THROW; +EV_API_DECL void ev_child_stop (EV_P_ ev_child *w) EV_THROW; +# endif + +# if EV_STAT_ENABLE +EV_API_DECL void ev_stat_start (EV_P_ ev_stat *w) EV_THROW; +EV_API_DECL void ev_stat_stop (EV_P_ ev_stat *w) EV_THROW; +EV_API_DECL void ev_stat_stat (EV_P_ ev_stat *w) EV_THROW; +# endif + +# if EV_IDLE_ENABLE +EV_API_DECL void ev_idle_start (EV_P_ ev_idle *w) EV_THROW; +EV_API_DECL void ev_idle_stop (EV_P_ ev_idle *w) EV_THROW; +# endif + +#if EV_PREPARE_ENABLE +EV_API_DECL void ev_prepare_start (EV_P_ ev_prepare *w) EV_THROW; +EV_API_DECL void ev_prepare_stop (EV_P_ ev_prepare *w) EV_THROW; +#endif + +#if EV_CHECK_ENABLE +EV_API_DECL void ev_check_start (EV_P_ ev_check *w) EV_THROW; +EV_API_DECL void ev_check_stop (EV_P_ ev_check *w) EV_THROW; +#endif + +# if EV_FORK_ENABLE +EV_API_DECL void ev_fork_start (EV_P_ ev_fork *w) EV_THROW; +EV_API_DECL void ev_fork_stop (EV_P_ ev_fork *w) EV_THROW; +# endif + +# if EV_CLEANUP_ENABLE +EV_API_DECL void ev_cleanup_start (EV_P_ ev_cleanup *w) EV_THROW; +EV_API_DECL void ev_cleanup_stop (EV_P_ ev_cleanup *w) EV_THROW; +# endif + +# if EV_EMBED_ENABLE +/* only supported when loop to be embedded is in fact embeddable */ +EV_API_DECL void ev_embed_start (EV_P_ ev_embed *w) EV_THROW; +EV_API_DECL void ev_embed_stop (EV_P_ ev_embed *w) EV_THROW; +EV_API_DECL void ev_embed_sweep (EV_P_ ev_embed *w) EV_THROW; +# endif + +# if 
EV_ASYNC_ENABLE
+EV_API_DECL void ev_async_start (EV_P_ ev_async *w) EV_THROW;
+EV_API_DECL void ev_async_stop (EV_P_ ev_async *w) EV_THROW;
+EV_API_DECL void ev_async_send (EV_P_ ev_async *w) EV_THROW;
+# endif
+
+#if EV_COMPAT3
+  #define EVLOOP_NONBLOCK EVRUN_NOWAIT
+  #define EVLOOP_ONESHOT  EVRUN_ONCE
+  #define EVUNLOOP_CANCEL EVBREAK_CANCEL
+  #define EVUNLOOP_ONE    EVBREAK_ONE
+  #define EVUNLOOP_ALL    EVBREAK_ALL
+  #if EV_PROTOTYPES
+    EV_INLINE void ev_loop (EV_P_ int flags) { ev_run (EV_A_ flags); }
+    EV_INLINE void ev_unloop (EV_P_ int how ) { ev_break (EV_A_ how ); }
+    EV_INLINE void ev_default_destroy (void) { ev_loop_destroy (EV_DEFAULT); }
+    EV_INLINE void ev_default_fork (void) { ev_loop_fork (EV_DEFAULT); }
+    #if EV_FEATURE_API
+      EV_INLINE unsigned int ev_loop_count (EV_P) { return ev_iteration (EV_A); }
+      EV_INLINE unsigned int ev_loop_depth (EV_P) { return ev_depth (EV_A); }
+      EV_INLINE void ev_loop_verify (EV_P) { ev_verify (EV_A); }
+    #endif
+  #endif
+#else
+  typedef struct ev_loop ev_loop;
+#endif
+
+#endif
+
+EV_CPP(})
+
+#endif
+
diff --git a/examples/udp_proxy/main.c b/examples/udp_proxy/main.c
new file mode 100644
index 0000000..ce69cd1
--- /dev/null
+++ b/examples/udp_proxy/main.c
@@ -0,0 +1,335 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "./libev/ev.h"
+
+/*
+ * UDP proxy example: WORKER threads each run a private libev loop on a
+ * UDP socket bound to <local_ip>:<local_port>; every new client datagram
+ * gets a dedicated socket pair that relays payloads between the client
+ * and the upstream at upstream_ip:upstream_port.
+ */
+#define WORKER 3
+
+char *local;
+char *lport;
+
+#define PORT 8080
+#define BUFFER_SIZE 1024
+#define MAX_CONNECTIONS 1024
+
+#define upstream_ip "1.1.1.2"
+#define upstream_port 9999
+
+struct connection {
+	struct ev_io *ev;
+	struct ev_timer *timer;
+	int fd;
+	void *loop;
+
+	struct ev_io *up_ev;
+	int up_fd;
+};
+
+void udp_accept_callback(struct ev_loop *loop, struct ev_io *watcher, int revents);
+int udp_socket_connect(struct sockaddr_in *peer, char *self_port, int isbind);
+
+void udp_read_callback(struct ev_loop *loop, struct ev_io *watcher, int revents);
+
+static void set_nonblocking(int sock)
+{
+	int opts;
+	opts = fcntl(sock, F_GETFL);
+
+	if (opts < 0) {
+		perror("fcntl(sock,GETFL)");
+		exit(1);
+	}
+
+	opts = opts | O_NONBLOCK;
+	if (fcntl(sock, F_SETFL, opts) < 0) {
+		perror("fcntl(sock, SETFL, opts)");
+		exit(1);
+	}
+}
+
+/*
+static void timeout_cb(struct ev_loop *loop, ev_timer *w, int revents)
+{
+///	time_t now;
+
+///	now = time(NULL);
+///	printf("in timer cb %ld , cur time is %s revents = %d EV_READ = %d EV_WRITE = %d\n",
+///	       (long int)(w->data), ctime(&now), revents, EV_READ, EV_WRITE);
+//	ev_timer_init(w, timeout_cb, 5, 0);
+//	ev_timer_start(loop, w);
+}
+*/
+
+int udp_socket_connect(struct sockaddr_in *peer, char *self_port, int isbind)
+{
+	int fd = 0;
+	int opt = 1; /* enable SO_REUSEADDR */
+
+	fd = socket(PF_INET, SOCK_DGRAM, 0);
+	if (fd == -1)
+		return -1;
+
+	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+	set_nonblocking(fd);
+
+	if (isbind) {
+		struct sockaddr_in my_addr;
+		bzero(&my_addr, sizeof(my_addr));
+		my_addr.sin_family = PF_INET;
+		my_addr.sin_port = htons(atoi(self_port));
+		my_addr.sin_addr.s_addr = inet_addr(local);
+		if (bind(fd, (struct sockaddr *)&my_addr, sizeof(struct sockaddr)) == -1) {
+			perror("bind");
+			exit(1);
+		} else {
+			printf("IP and port bind success\n");
+		}
+	}
+
+	connect(fd, (struct sockaddr *)peer, sizeof(struct sockaddr_in));
+
+	return fd;
+}
+
+static int create_upstream_peer(struct connection *conn)
+{
+	struct sockaddr_in addr;
+	struct ev_io *ev = NULL;
+
+	bzero(&addr, sizeof(addr));
+	addr.sin_family = PF_INET;
+	addr.sin_port = htons(upstream_port);
+	addr.sin_addr.s_addr = inet_addr(upstream_ip);
+	conn->up_fd = udp_socket_connect(&addr, NULL, 0);
+	if (conn->up_fd < 0)
+		return -1;
+
+	ev = (struct ev_io *)malloc(sizeof(struct ev_io));
+	if (!ev) {
+		return -1;
+	}
+
+	ev->data = conn;
+	conn->up_ev = ev;
+
+	ev_io_init(ev, udp_read_callback, conn->up_fd, EV_READ);
+	ev_io_start(conn->loop, ev);
+
+	return 0;
+}
+
+void udp_accept_callback(struct ev_loop *loop, struct ev_io *watcher, int revents)
+{
+	int client_sd;
+	struct sockaddr_in addr;
+	socklen_t client_len = sizeof(addr);
+	struct ev_io *client_watcher = NULL;
+	struct ev_timer *timeout_watcher = NULL;
+	struct connection *conn = NULL;
+	char buffer[BUFFER_SIZE];
+	int ret = 0;
+
+	if (EV_ERROR & revents) {
+		printf("error event in accept\n");
+		return;
+	}
+
+	conn = (struct connection *)malloc(sizeof(struct connection));
+	if (!conn) {
+		return;
+	}
+
+	ret = recvfrom(watcher->fd, buffer, BUFFER_SIZE, 0, (struct sockaddr *)&addr, &client_len);
+	if (ret < 0) {
+		perror("recvfrom");
+		free(conn);
+		return;
+	}
+
+	client_sd = udp_socket_connect(&addr, lport, 1);
+	if (client_sd < 0) {
+		printf("accept error\n");
+		free(conn);
+		return;
+	}
+
+	client_watcher = (struct ev_io *)malloc(sizeof(struct ev_io));
+	timeout_watcher = (struct ev_timer *)malloc(sizeof(struct ev_timer));
+
+	if (!client_watcher) {
+		free(conn);
+		return;
+	}
+
+	if (!timeout_watcher) {
+		free(client_watcher);
+		free(conn);
+		return;
+	}
+
+	printf("client connected, fd: %d\n", client_sd);
+
+	// watch the new per-client socket
+	ev_io_init(client_watcher, udp_read_callback, client_sd, EV_READ);
+	ev_io_start(loop, client_watcher);
+
+	// add a timer for this fd
+	/*
+	timeout_watcher->data = (void *)(long)client_sd;
+	ev_timer_init(timeout_watcher, timeout_cb, 5, 0);
+	ev_timer_start(loop, timeout_watcher);
+	*/
+
+	conn->ev = client_watcher;
+	conn->timer = timeout_watcher;
+	conn->fd = client_sd;
+	conn->loop = loop;
+	client_watcher->data = conn;
+
+	// relay the first datagram that created this connection
+	if (create_upstream_peer(conn) == 0)
+		ret = write(conn->up_fd, buffer, ret);
+
+	bzero(buffer, BUFFER_SIZE);
+}
+
+static void conn_finish(struct connection *conn)
+{
+	struct ev_loop *loop = (struct ev_loop *)conn->loop;
+
+	close(conn->fd);
+	ev_io_stop(loop, conn->ev);
+	ev_timer_stop(loop, conn->timer);
+
+	close(conn->up_fd);
+	ev_io_stop(loop, conn->up_ev);
+
+	free(conn);
+}
+
+void udp_read_callback(struct ev_loop *loop, struct ev_io *watcher, int revents)
+{
+	struct connection *conn = NULL;
+	char buffer[BUFFER_SIZE];
+	ssize_t len;
+
+	if (EV_ERROR & revents) {
+		printf("error event in read\n");
+		return;
+	}
+
+	// socket recv
+	len = recv(watcher->fd, buffer, BUFFER_SIZE, 0); // read datagram into buffer
+	if (len < 0) {
+		if (errno == EINTR || errno == EAGAIN) {
+			/* transient, retry on the next event */
+			return;
+		}
+		printf("read error\n");
+	}
+
+	if (len <= 0) {
+		printf("client closed.\n");
+		goto conn_free;
+	}
+
+	conn = (struct connection *)watcher->data;
+	if (watcher->fd == conn->fd) {
+		// forward the client's datagram to the upstream
+		len = send(conn->up_fd, buffer, len, 0);
+	} else if (watcher->fd == conn->up_fd) {
+		// forward the upstream's reply to the client
+		len = send(conn->fd, buffer, len, 0);
+	}
+
+	bzero(buffer, BUFFER_SIZE);
+
+conn_free:
+	if (watcher->data) {
+		//conn_finish((struct connection *)watcher->data);
+	}
+}
+
+void *ev_cycle(void *data)
+{
+	long w = (long)data;
+	int sd;
+	struct sockaddr_in addr;
+	struct ev_loop *loop = NULL;
+	struct ev_io *socket_watcher = (struct ev_io *)malloc(sizeof(struct ev_io));
+
+	if (!socket_watcher)
+		return NULL;
+
+	signal(SIGPIPE, SIG_IGN);
+
+	// socket
+	sd = socket(PF_INET, SOCK_DGRAM, 0);
+	if (sd < 0) {
+		printf("socket error\n");
+		return NULL;
+	}
+	bzero(&addr, sizeof(addr));
+	addr.sin_family = AF_INET;
+	addr.sin_port = htons(atoi(lport));
+	addr.sin_addr.s_addr = inet_addr(local);
+
+	set_nonblocking(sd);
+
+	// allow address reuse; must be set before bind to take effect
+	int bReuseaddr = 1;
+	if (setsockopt(sd, SOL_SOCKET, SO_REUSEADDR, (const char *)&bReuseaddr, sizeof(bReuseaddr)) != 0) {
+		printf("setsockopt error in reuseaddr[%d]\n", sd);
+		return NULL;
+	}
+
+	// bind
+	if (bind(sd, (struct sockaddr *)&addr, sizeof(addr)) != 0) {
+		printf("bind error\n");
+		return NULL;
+	}
+
+	loop = ev_loop_new(0);
+
+	printf("worker: %ld loop: %p\n", w, loop);
+
+	/* init ev_io */
+	socket_watcher->data = NULL;
+	ev_io_init(socket_watcher, udp_accept_callback, sd, EV_READ);
+	ev_io_start(loop, socket_watcher);
+
+	/* ev loop: busy-poll this worker's private loop */
+	while (1) {
+		ev_run(loop, EVRUN_NOWAIT);
+	}
+
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 3) {
+		fprintf(stderr, "Usage: %s <local_ip> <local_port>\n", argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	local = argv[1];
+	lport = argv[2];
+
+	long i = 0;
+
+	pthread_t th[WORKER];
+	for (i = 0; i < WORKER; i++) {
+
+		//sleep(2);
+		if (pthread_create(&th[i], NULL, ev_cycle, (void *)i)) {
+			perror("Failed to start all worker threads");
+			return 1;
+		}
+	}
+
+	for (i = 0; i < WORKER; i++) {
+		pthread_join(th[i], NULL);
+	}
+
+	return 0;
+}
diff --git a/examples/udp_server/Makefile b/examples/udp_server/Makefile
new file mode 100644
index 0000000..5706396
--- /dev/null
+++ b/examples/udp_server/Makefile
@@ -0,0 +1,40 @@
+# Copyright (c) 2018 Ant Financial Services Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
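+#
+# Build/run sketch (an illustration; assumes the DPDK/TLDK build flow from
+# the top-level README, and the address/port below are just examples):
+#
+#   $ export RTE_SDK=<dpdk dir> RTE_TARGET=x86_64-native-linuxapp-gcc
+#   $ export TLDK_ROOT=<tldk dir>
+#   $ make -C examples/udp_server
+#   $ ./udp_server 1.1.1.1 9999
+#
+# Protocol used by this example: a datagram prefixed with '0' on the listen
+# port opens a session and is answered with a '1' prefix from a new
+# connected socket; '2'-prefixed messages on that socket are echoed back.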
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+ifeq ($(RTE_TARGET),)
+$(error "Please define RTE_TARGET environment variable")
+endif
+
+ifeq ($(TLDK_ROOT),)
+$(error "Please define TLDK_ROOT environment variable")
+endif
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = udp_server
+
+# all sources are stored in SRCS-y
+SRCS-y += udp_server.c
+
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(RTE_OUTPUT)/include
+
+LDLIBS += -L$(RTE_OUTPUT)/lib
+LDLIBS += -ltle_glue -ltle_misc -ltle_l4p -ltle_timer
+
+include $(TLDK_ROOT)/mk/tle.app.mk
diff --git a/examples/udp_server/udp_server.c b/examples/udp_server/udp_server.c
new file mode 100644
index 0000000..970f2ce
--- /dev/null
+++ b/examples/udp_server/udp_server.c
@@ -0,0 +1,239 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <netdb.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define ECHO_LEN 1025
+
+#define MAXBUF 1024
+#define MAXEPOLLSIZE 100
+
+#ifndef NI_MAXHOST
+#define NI_MAXHOST 1025
+#endif
+#ifndef NI_MAXSERV
+#define NI_MAXSERV 32
+#endif
+
+char *laddr;
+char *lport;
+
+static void ipshow(struct sockaddr *addr)
+{
+	struct sockaddr_in *ina = (struct sockaddr_in *)addr;
+	static char szAddr[20] = "\0";
+
+	/* print as unsigned bytes, otherwise octets > 127 come out negative */
+	unsigned char *p = (unsigned char *)&ina->sin_addr;
+	sprintf(szAddr, "%u.%u.%u.%u", p[0], p[1], p[2], p[3]);
+	printf("ip:%s port:%u\n", szAddr, ntohs(ina->sin_port));
+}
+
+static int setnonblocking(int sockfd)
+{
+	if (fcntl(sockfd, F_SETFL, fcntl(sockfd, F_GETFL, 0) | O_NONBLOCK) == -1) {
+		return -1;
+	}
+
+	return 0;
+}
+
+static int add_event(int epollfd, int fd, int state)
+{
+	struct epoll_event ev;
+
+	ev.events = state;
+	ev.data.fd = fd;
+
+	return epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev);
+}
+
+static int delete_event(int epollfd, int fd, int state)
+{
+	struct epoll_event ev;
+
+	ev.events = state;
+	ev.data.fd = fd;
+
+	return epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &ev);
+}
+
+static void do_write(int epollfd, int fd, char *buf)
+{
+	int nwrite = 0;
+
+	nwrite = write(fd, buf, strlen(buf));
+	if (nwrite == -1) {
+		perror("write error:");
+		close(fd);
+		delete_event(epollfd, fd, EPOLLOUT);
+	}
+}
+
+static int udp_socket_connect(int epollfd, struct sockaddr_in *servaddr)
+{
+	struct sockaddr_in my_addr;
+	int fd = 0;
+	int opt = 1; /* enable SO_REUSEADDR */
+
+	fd = socket(PF_INET, SOCK_DGRAM, 0);
+	if (fd == -1) {
+		return -1;
+	}
+
+	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+	setnonblocking(fd);
+
+	bzero(&my_addr, sizeof(my_addr));
+	my_addr.sin_family = PF_INET;
+	my_addr.sin_port = htons(atoi(lport));
+	my_addr.sin_addr.s_addr = inet_addr(laddr);
+	if (bind(fd, (struct sockaddr *)&my_addr, sizeof(struct sockaddr)) == -1) {
+		perror("bind");
+		exit(1);
+	} else {
+		printf("IP and port bind success\n");
+	}
+
+	connect(fd, (struct sockaddr *)servaddr, sizeof(struct sockaddr_in));
+	add_event(epollfd, fd, EPOLLIN);
+
+	return fd;
+}
+
+static void accept_client(int epollfd, int fd)
+{
+	struct sockaddr_storage client_addr;
+	socklen_t addr_size = sizeof(client_addr);
+	char buf[1024];
+	int new_sock;
+	int ret = 0;
+
+	/* leave room for the terminating '\0' appended below */
+	ret = recvfrom(fd, buf, sizeof(buf) - 1, 0, (struct sockaddr *)&client_addr,
+		       &addr_size);
+	if (ret > 0)
+		printf("recvfrom len = %d\n", ret);
+	else {
+		perror("recvfrom");
+		return;
+	}
+
+	buf[ret] = '\0';
+	char type = buf[0];
+	char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
+	ret = getnameinfo((struct sockaddr *)&client_addr, addr_size, hbuf,
+			  sizeof(hbuf), sbuf, sizeof(sbuf),
+			  NI_NUMERICHOST | NI_NUMERICSERV);
+	if (ret != 0) {
+		fprintf(stderr, "getnameinfo: %s\n", gai_strerror(ret));
+		return;
+	}
+
+	ipshow((struct sockaddr *)&client_addr);
+	printf("recvfrom client [%s:%s] : %c\n", hbuf, sbuf, buf[0]);
+
+	if (type != '0') {
+		return;
+	}
+
+	/* answer the '0' handshake with a '1' from a fresh connected socket */
+	new_sock = udp_socket_connect(epollfd, (struct sockaddr_in *)&client_addr);
+	if (new_sock < 0)
+		return;
+	buf[0] = '1';
+	do_write(epollfd, new_sock, buf);
+}
+
+static void msg_process(int epollfd, int fd)
+{
+	int nread = 0;
+	char buf[MAXBUF];
+	char type;
+
+	nread = read(fd, buf, MAXBUF - 1);
+	if (nread <= 0)
+		return;
+	if (nread < 2) {
+		printf("prefix should be [0|1]\n");
+		return;
+	}
+
+	buf[nread] = '\0';
+	type = buf[0];
+
+	if (type == '2') {
+		printf("recv msg [len: %d]\n", nread - 1);
+		do_write(epollfd, fd, buf);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	int listener, kdpfd, nfds, n;
+	struct sockaddr_in my_addr;
+	struct epoll_event ev;
+	struct epoll_event events[MAXEPOLLSIZE];
+
+	if (argc != 3) {
+		fprintf(stderr, "Usage: %s <local_ip> <local_port>\n", argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	laddr = argv[1];
+	lport = argv[2];
+
+	if ((listener = socket(PF_INET, SOCK_DGRAM, 0)) == -1) {
+		perror("socket create failed");
+		exit(1);
+	} else {
+		printf("socket create success\n");
+	}
+
+	int opt = 1; /* enable SO_REUSEADDR */
+	setsockopt(listener, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+	setnonblocking(listener);
+
+	bzero(&my_addr, sizeof(my_addr));
+	my_addr.sin_family = PF_INET;
+	my_addr.sin_port = htons(atoi(lport));
+	my_addr.sin_addr.s_addr = inet_addr(laddr);
+	if (bind(listener, (struct sockaddr *)&my_addr,
+		 sizeof(struct sockaddr)) == -1) {
+		perror("bind");
+		exit(1);
+	} else {
+		printf("IP and port bind success\n");
+	}
+
+	kdpfd = epoll_create(MAXEPOLLSIZE);
+	ev.events = EPOLLIN | EPOLLET;
+	ev.data.fd = listener;
+	if (epoll_ctl(kdpfd, EPOLL_CTL_ADD, listener, &ev) < 0) {
+		fprintf(stderr, "epoll set insertion error: fd=%d\n", listener);
+		return -1;
+	} else {
+		printf("listen socket added in epoll success\n");
+	}
+
+	while (1) {
+		nfds = epoll_wait(kdpfd, events, MAXEPOLLSIZE, -1);
+		if (nfds == -1 && errno != EINTR) {
+			perror("epoll_wait");
+			break;
+		}
+
+		for (n = 0; n < nfds; ++n) {
+			if (events[n].data.fd == listener) {
+				accept_client(kdpfd, listener);
+			} else {
+				msg_process(kdpfd, events[n].data.fd);
+			}
+		}
+	}
+
+	close(listener);
+	return 0;
+}
diff --git a/lib/Makefile b/lib/Makefile
index 6317af9..9bbe159 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,5 +25,6 @@ DIRS-y += libtle_misc
 DIRS-y += libtle_dring
 DIRS-y += libtle_timer
 DIRS-y += libtle_l4p
+DIRS-y += libtle_glue
 
 include $(TLDK_ROOT)/mk/tle.subdir.mk
diff --git a/lib/libtle_glue/Makefile b/lib/libtle_glue/Makefile
new file mode 100644
index 0000000..13ceb82
--- /dev/null
+++ b/lib/libtle_glue/Makefile
@@ -0,0 +1,62 @@
+# Copyright (c) 2018 Ant Financial Services Group.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
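+#
+# Note (a sketch of the build knobs, not an exhaustive list): building for
+# preload use requires -fPIC in EXTRA_CFLAGS so this archive can be folded
+# into the shared libtldk.so, and PACKETDRILL=y additionally compiles
+# packetdrill.c (see the conditional below) to let the glue layer drive the
+# packetdrill-based stack tests.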
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+# Default target, can be overwritten by command line or environment
+RTE_TARGET ?= x86_64-native-linuxapp-gcc
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = libtle_glue.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+
+EXPORT_MAP := tle_glue_version.map
+
+LIBABIVER := 1
+
+# source files
+SRCS-y += fd.c
+SRCS-y += ctx.c
+SRCS-y += arp.c
+SRCS-y += icmp.c
+SRCS-y += rxcb.c
+SRCS-y += port.c
+SRCS-y += sym.c
+SRCS-y += init.c
+SRCS-y += be.c
+SRCS-y += epoll.c
+SRCS-y += socket.c
+SRCS-y += rxtx.c
+SRCS-y += poll.c
+SRCS-y += util.c
+SRCS-y += tcp.c
+SRCS-y += udp.c
+SRCS-y += select.c
+
+ifeq ($(PACKETDRILL),y)
+SRCS-y += packetdrill.c
+endif
+
+# install this header file
+SYMLINK-y-include += tle_glue.h
+
+# this lib's dependencies
+DEPDIRS-y += lib/libtle_l4p
+
+include $(TLDK_ROOT)/mk/tle.lib.mk
diff --git a/lib/libtle_glue/arp.c b/lib/libtle_glue/arp.c
new file mode 100644
index 0000000..4398a57
--- /dev/null
+++ b/lib/libtle_glue/arp.c
@@ -0,0 +1,1006 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <string.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+#include <arpa/inet.h>
+
+#include <rte_ethdev.h>
+#include <rte_arp.h>
+#include <rte_ip.h>
+#include <rte_hash.h>
+#include <rte_spinlock.h>
+
+#include "log.h"
+#include "ctx.h"
+#include "internal.h"
+#include "tle_timer.h"
+#include "util.h"
+#include "../libtle_l4p/net_misc.h"
+#include "ndp.h"
+#include "gateway.h"
+
+#define ARP_ENTRY_EXPIRE 60000U /* ms */
+#define ARP_REQUEST_EXPIRE 1000U /* ms */
+#define ARP_MAX_REQ_TIMES 5
+
+static inline void
+set_multicast_mac_v6(struct ether_addr *addr, const struct in6_addr *ip6_addr)
+{
+	unaligned_uint16_t *ea_words = (unaligned_uint16_t *)addr;
+	ea_words[0] = 0x3333;
+	ea_words[1] = ip6_addr->__in6_u.__u6_addr16[6];
+	ea_words[2] = ip6_addr->__in6_u.__u6_addr16[7];
+}
+
+static inline void
+set_multicast_ipv6(uint8_t ipv6[16])
+{
+	rte_memcpy(ipv6, &tle_ipv6_multi_mask, IPV6_MULTI_MASK_LEN);
+}
+
+static inline void
+set_broadcast_addr(struct ether_addr *addr)
+{
+	unaligned_uint16_t *ea_words = (unaligned_uint16_t *)addr;
+	ea_words[0] = 0xFFFF;
+	ea_words[1] = 0xFFFF;
+	ea_words[2] = 0xFFFF;
+}
+
+static void
+print_arp_entry(const struct in_addr *ip, const struct ether_addr *mac,
+		const char *action)
+{
+	char str_ip[16];
+	char str_mac[32];
+
+	ether_format_addr(str_mac, sizeof(str_mac), mac);
+	inet_ntop(AF_INET, &ip->s_addr, str_ip, sizeof(str_ip));
+	GLUE_LOG(DEBUG, "%s ARP entry: ipv4=%s/%u, mac=%s",
+		 action, str_ip, 24, str_mac);
+}
+
+static void
+print_arp6_entry(const struct in6_addr *ip6, const struct ether_addr *mac,
+		 const char *action)
+{
+	char str_ip[64];
+	char str_mac[32];
+
+	ether_format_addr(str_mac, sizeof(str_mac), mac);
+	inet_ntop(AF_INET6, ip6, str_ip, sizeof(str_ip));
+	GLUE_LOG(DEBUG, "%s ARP6 entry: ipv6=%s, mac=%s",
+		 action, str_ip, str_mac);
+}
+
+void
+ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr,
+	     struct ether_addr *e_addr)
+{
+	struct rte_mbuf *pkt,
*pkts[32], *pre; + uint32_t nb_pkts; + struct arp_entry* entry; + struct tle_dest *dst; + struct ether_hdr *eth; + struct ipv6_hdr *ip6h; + uint64_t idx; + int rc; + uint8_t check_arp_wait = 1; + struct in6_addr gate6; + + rc = rte_hash_lookup_data(ctx->arp6_hash, addr, (void**)&idx); + if (rc >= 0) { + entry = &ctx->arp6[idx]; + dst = &entry->dst; + eth = (struct ether_hdr *)dst->hdr; + + if (!is_broadcast_ether_addr(ð->d_addr)) { + check_arp_wait = 0; + } + /* update arp entry, reset timer */ + ether_addr_copy(e_addr, ð->d_addr); + print_arp6_entry(addr, ð->d_addr, "UPDATE"); + if(entry->timer != NULL) + { + tle_timer_stop(ctx->arp_tmw, entry->timer); + } + entry->timer = tle_timer_start(ctx->arp_tmw, entry, ARP_ENTRY_EXPIRE); + entry->inuse = 0; + + if(check_arp_wait == 0) + return; + + /* arp entry start to work */ + entry->req_time = 0; + nb_pkts = 0; + pkt = ctx->arp_wait; + for (pre = NULL; pkt; pkt = pkt->next_pkt) { + ip6h = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *, pkt->l2_len); + if (((ip6h->vtc_flow & 0xffffff00) >> 4) != 6 || memcmp( + ipv6_gateway_lookup(ctx, (struct in6_addr *)&ip6h->dst_addr, &gate6), + addr, sizeof(struct in6_addr)) != 0) { + pre = pkt; + continue; + } + + if (pre == NULL) + ctx->arp_wait = pkt->next_pkt; + else + pre->next_pkt = pkt->next_pkt; + eth = rte_pktmbuf_mtod(pkt, struct ether_hdr *); + ether_addr_copy(e_addr, ð->d_addr); + pkts[nb_pkts++] = pkt; + if (nb_pkts == 32) { + rte_eth_tx_burst(ctx->port_id, ctx->queue_id, pkts, nb_pkts); + TRACE("After ARP learn, send %u pkts", nb_pkts); + nb_pkts = 0; + } + } + if (nb_pkts && + rte_eth_tx_burst(ctx->port_id, ctx->queue_id, pkts, nb_pkts)) + TRACE("After ARP learn, send %u pkts", nb_pkts); + return; + } + + rte_spinlock_lock(&ctx->arp6_lock); + idx = ctx->arp6_num; + entry = &ctx->arp6[idx]; + dst = &entry->dst; + + /* no need to set dst->dev */ + dst->mtu = 1500; + dst->l2_len = sizeof(*eth); + dst->head_mp = get_mempool_by_socket(0); /* fix me */ + + eth = (struct ether_hdr *)dst->hdr; + ether_addr_copy(&ctx->mac, ð->s_addr); + if (e_addr == NULL) { + set_broadcast_addr(ð->d_addr); + entry->timer = tle_timer_start(ctx->arp_tmw, entry, ARP_REQUEST_EXPIRE); + entry->req_time = 1; + } + else { + ether_addr_copy(e_addr, ð->d_addr); + entry->timer = tle_timer_start(ctx->arp_tmw, entry, ARP_ENTRY_EXPIRE); + entry->inuse = 0; + } + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6); + + dst->l3_len = sizeof(*ip6h); + ip6h = (struct ipv6_hdr *)(eth + 1); + rte_memcpy(ip6h->dst_addr, addr, sizeof(struct in6_addr)); + ip6h->vtc_flow = 6 << 4; + ip6h->hop_limits = 255; + ip6h->proto = IPPROTO_TCP; + + rc = rte_hash_add_key_data(ctx->arp6_hash, addr, (void*)idx); + if (rc < 0) + rte_panic("Failed to add ARP6 entry"); + + print_arp6_entry(addr, ð->d_addr, "ADD"); + ctx->arp6_num++; + rte_spinlock_unlock(&ctx->arp6_lock); +} + +void +ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr, + struct ether_addr *e_addr) +{ + struct rte_mbuf *pkt, *pkts[32], *pre; + uint32_t nb_pkts; + struct arp_entry* entry; + struct tle_dest *dst; + struct ether_hdr *eth; + struct ipv4_hdr *ip4h; + uint64_t idx; + int rc; + uint8_t check_arp_wait = 1; + struct in_addr gate4; + + rc = rte_hash_lookup_data(ctx->arp_hash, addr, (void**)&idx); + if (rc >= 0) { + entry = &ctx->arp4[idx]; + dst = &entry->dst; + eth = (struct ether_hdr *)dst->hdr; + + if (!is_broadcast_ether_addr(ð->d_addr)) { + check_arp_wait = 0; + } + /* update arp entry, reset timer */ + ether_addr_copy(e_addr, ð->d_addr); + print_arp_entry(addr, 
ð->d_addr, "UPDATE"); + if(entry->timer != NULL) + { + tle_timer_stop(ctx->arp_tmw, entry->timer); + } + entry->timer = tle_timer_start(ctx->arp_tmw, entry, ARP_ENTRY_EXPIRE); + entry->inuse = 0; + + if(check_arp_wait == 0) + return; + + /* arp entry start to work */ + entry->req_time = 0; + nb_pkts = 0; + pkt = ctx->arp_wait; + for (pre = NULL; pkt; pkt = pkt->next_pkt) { + ip4h = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, pkt->l2_len); + if ((ip4h->version_ihl >> 4) != 4 + || ipv4_gateway_lookup(ctx, (struct in_addr *)&ip4h->dst_addr, &gate4) + ->s_addr != addr->s_addr) { + pre = pkt; + continue; + } + if (pre == NULL) + ctx->arp_wait = pkt->next_pkt; + else + pre->next_pkt = pkt->next_pkt; + eth = rte_pktmbuf_mtod(pkt, struct ether_hdr *); + ether_addr_copy(e_addr, ð->d_addr); + pkts[nb_pkts++] = pkt; + if (nb_pkts == 32) { + rte_eth_tx_burst(ctx->port_id, ctx->queue_id, pkts, nb_pkts); + TRACE("After ARP learn, send %u pkts", nb_pkts); + nb_pkts = 0; + } + } + if (nb_pkts && + rte_eth_tx_burst(ctx->port_id, ctx->queue_id, pkts, nb_pkts)) + TRACE("After ARP learn, send %u pkts", nb_pkts); + return; + } + + rte_spinlock_lock(&ctx->arp_lock); + idx = ctx->arp4_num; + entry = &ctx->arp4[idx]; + dst = &entry->dst; + + /* no need to set dst->dev */ + dst->mtu = 1500; + dst->l2_len = sizeof(*eth); + dst->head_mp = get_mempool_by_socket(0); /* fix me */ + + eth = (struct ether_hdr *)dst->hdr; + ether_addr_copy(&ctx->mac, ð->s_addr); + if (e_addr == NULL) { + set_broadcast_addr(ð->d_addr); + entry->timer = tle_timer_start(ctx->arp_tmw, entry, ARP_REQUEST_EXPIRE); + entry->req_time = 1; + } + else { + ether_addr_copy(e_addr, ð->d_addr); + entry->timer = tle_timer_start(ctx->arp_tmw, entry, ARP_ENTRY_EXPIRE); + entry->inuse = 0; + } + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + + dst->l3_len = sizeof(*ip4h); + ip4h = (struct ipv4_hdr *)(eth + 1); + ip4h->dst_addr = addr->s_addr; + ip4h->version_ihl = 4 << 4 | sizeof(*ip4h) / IPV4_IHL_MULTIPLIER; + ip4h->time_to_live = 64; + ip4h->next_proto_id = IPPROTO_TCP; + + rc = rte_hash_add_key_data(ctx->arp_hash, addr, (void*)idx); + if (rc < 0) + rte_panic("Failed to add ARP entry"); + + print_arp_entry(addr, ð->d_addr, "ADD"); + ctx->arp4_num++; + rte_spinlock_unlock(&ctx->arp_lock); +} + +static inline int +arp_ip_exist(struct glue_ctx *ctx, uint32_t *ip) +{ + return rte_hash_lookup(ctx->arp_hash, ip) >= 0; +} + +static inline int +arp6_ip_exist(struct glue_ctx *ctx, struct in6_addr* ipv6) +{ + return rte_hash_lookup(ctx->arp6_hash, ipv6) >= 0; +} + +struct rte_mbuf * +ndp_recv(struct glue_ctx *ctx, struct rte_mbuf *m, uint32_t l2len, uint32_t l3len) +{ + struct ether_hdr *eth_h; + struct ipv6_hdr *ipv6_h; + struct nd_neighbor_solicit *ns_h; + struct nd_opt_hdr *opth; + + eth_h = rte_pktmbuf_mtod(m, struct ether_hdr *); + ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2len); + ns_h = rte_pktmbuf_mtod_offset(m, struct nd_neighbor_solicit*, l2len + l3len); + + if (ipv6_h->payload_len < sizeof(struct nd_neighbor_solicit)) + goto drop; + + /* We only learn mac when: + * 1. Normal NS for my ip, whose TargetAddr is me + * 2. Normal NA to my ip, whose DstIpv6 is me + * 3. 
Unsolicited NA, and we already have an entry for that IP + */ + + /* NS message */ + if (ns_h->nd_ns_hdr.icmp6_type == ND_NEIGHBOR_SOLICIT) { + /* not support Duplicate Address Detect NS yet */ + if (IN6_IS_ADDR_UNSPECIFIED(ipv6_h->src_addr)) { + goto drop; + } + + /* NS message, target is my ipv6 addr */ + if (memcmp(&ns_h->nd_ns_target, &ctx->ipv6, + sizeof(struct in6_addr)) == 0) { + opth = (struct nd_opt_hdr*)(ns_h + 1); + ipv6_dst_add(ctx, (struct in6_addr*)ipv6_h->src_addr, + (struct ether_addr*)(opth + 1)); + + /* response NA message */ + ether_addr_copy(&ctx->mac, ð_h->s_addr); + ether_addr_copy((struct ether_addr*)(opth + 1), ð_h->d_addr); + + rte_memcpy(ipv6_h->dst_addr, ipv6_h->src_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->src_addr, &ctx->ipv6, + sizeof(struct in6_addr)); + + ns_h->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_ADVERT; + ns_h->nd_ns_hdr.icmp6_dataun.icmp6_un_data8[0] = 0x60; + ns_h->nd_ns_hdr.icmp6_cksum = 0; + + opth->nd_opt_type = ND_OPT_TARGET_LINKLAYER_ADDR; + ether_addr_copy(&ctx->mac, (struct ether_addr*)(opth + 1)); + + ns_h->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ipv6_h, ns_h); + + if (m->pkt_len < ETHER_MIN_LEN) + rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len); + + if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &m, 1)) + GLUE_LOG(DEBUG, "Send NDP NA reply"); + + return NULL; + } + } else { + /* NA message */ + if (memcmp(ipv6_h->dst_addr, &ctx->ipv6, sizeof(struct in6_addr)) == 0 || + (memcmp(ipv6_h->dst_addr, &tle_ipv6_all_multi, + sizeof(struct in6_addr)) == 0 && + arp6_ip_exist(ctx, &ns_h->nd_ns_target))) { + opth = (struct nd_opt_hdr*)(ns_h + 1); + ipv6_dst_add(ctx, &ns_h->nd_ns_target, (struct ether_addr*)(opth + 1)); + } + } + +drop: + rte_pktmbuf_free(m); + return NULL; +} + +struct rte_mbuf * +arp_recv(struct glue_ctx *ctx, struct rte_mbuf *m, uint32_t l2len) +{ + struct ether_hdr *eth; + struct arp_hdr *ahdr; + struct arp_ipv4 *adata; + uint32_t tip; + + eth = rte_pktmbuf_mtod(m, struct ether_hdr *); + ahdr = rte_pktmbuf_mtod_offset(m, struct arp_hdr *, l2len); + + if (ahdr->arp_hrd != rte_be_to_cpu_16(ARP_HRD_ETHER) || + ahdr->arp_pro != rte_be_to_cpu_16(ETHER_TYPE_IPv4)) + goto drop; + + adata = &ahdr->arp_data; + tip = adata->arp_tip; + + /* We only learn mac when: + * 1. tip is me, or + * 2. this is a RARP, and we already have an entry for that IP + */ + if (tip == ctx->ipv4 || + (tip == INADDR_ANY && arp_ip_exist(ctx, &adata->arp_sip))) + ipv4_dst_add(ctx, (struct in_addr *)&adata->arp_sip, + &adata->arp_sha); + + /* We only do ARP reply when: + * 1. tip is me. 
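+	 *    Requests for any other target fall through and are dropped.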
+ */ + if (ahdr->arp_op == rte_be_to_cpu_16(ARP_OP_REQUEST) && + tip == ctx->ipv4) { + eth->d_addr = eth->s_addr; + eth->s_addr = ctx->mac; + ahdr->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY); + + adata->arp_tip = adata->arp_sip; + adata->arp_sip = tip; + + adata->arp_tha = adata->arp_sha; + adata->arp_sha = ctx->mac; + if (m->pkt_len < ETHER_MIN_LEN) + rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len); + PKT_DUMP(m); + if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &m, 1)) + TRACE("sent arp reply"); + return NULL; + } +drop: + rte_pktmbuf_free(m); + return NULL; +} + +static void +arp6_send_request(struct glue_ctx *ctx, const struct in6_addr *addr) +{ + struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */ + struct ether_hdr *eth; + struct ipv6_hdr *ip6h; + struct nd_neighbor_solicit *nsh; + struct nd_opt_hdr *opth; + struct ether_addr *sll_addr; + struct rte_mbuf *m; +#ifdef ENABLE_TRACE + char str_ip[64]; +#endif + + m = rte_pktmbuf_alloc(mp); + if (m == NULL) + rte_panic("Failed to alloc mbuf for ndp ns request"); + + eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth)); + ether_addr_copy(&ctx->mac, ð->s_addr); + set_multicast_mac_v6(ð->d_addr, addr); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6); + + ip6h = (struct ipv6_hdr*)rte_pktmbuf_append(m, sizeof(struct ipv6_hdr)); + ip6h->vtc_flow = 6 << 4; + ip6h->payload_len = sizeof(struct nd_neighbor_solicit) + + sizeof(struct nd_opt_hdr) + sizeof(struct ether_addr); + ip6h->proto = IPPROTO_ICMPV6; + ip6h->hop_limits = 255; + rte_memcpy(ip6h->src_addr, &ctx->ipv6, sizeof(struct in6_addr)); + rte_memcpy(ip6h->dst_addr, addr, sizeof(struct in6_addr)); + set_multicast_ipv6(ip6h->dst_addr); + + nsh = (struct nd_neighbor_solicit*)rte_pktmbuf_append(m, + sizeof(struct nd_neighbor_solicit)); + nsh->nd_ns_hdr.icmp6_type = ND_NEIGHBOR_SOLICIT; + nsh->nd_ns_hdr.icmp6_code = 0; + nsh->nd_ns_hdr.icmp6_cksum = 0; + nsh->nd_ns_hdr.icmp6_dataun.icmp6_un_data32[0] = 0; + rte_memcpy(&nsh->nd_ns_target, addr, sizeof(struct in6_addr)); + + opth = (struct nd_opt_hdr*)rte_pktmbuf_append(m, sizeof(struct nd_opt_hdr)); + opth->nd_opt_type = ND_OPT_SOURCE_LINKLAYER_ADDR; + opth->nd_opt_len = 1; + + sll_addr = (struct ether_addr*)rte_pktmbuf_append(m, sizeof(struct ether_addr)); + ether_addr_copy(&ctx->mac, sll_addr); + + nsh->nd_ns_hdr.icmp6_cksum = rte_ipv6_udptcp_cksum(ip6h, nsh); + + while (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &m, 1) == 0); +} + +static void +arp_send_request(struct glue_ctx *ctx, const struct in_addr *addr) +{ + struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */ + struct ether_hdr *eth; + struct arp_hdr *ahdr; + struct arp_ipv4 *adata; + struct rte_mbuf *m; + uint16_t pad_len, i; + char *pad; + + m = rte_pktmbuf_alloc(mp); + if (m == NULL) + rte_panic("Failed to alloc mbuf for arp request"); + + eth = (struct ether_hdr *)rte_pktmbuf_append(m, sizeof(*eth)); + ether_addr_copy(&ctx->mac, ð->s_addr); + set_broadcast_addr(ð->d_addr); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_ARP); + + ahdr = (struct arp_hdr *)rte_pktmbuf_append(m, sizeof(*ahdr)); + ahdr->arp_hrd = rte_be_to_cpu_16(ARP_HRD_ETHER); + ahdr->arp_pro = rte_be_to_cpu_16(ETHER_TYPE_IPv4); + ahdr->arp_hln = sizeof(struct ether_addr); + ahdr->arp_pln = sizeof(*addr); + ahdr->arp_op = rte_be_to_cpu_16(ARP_OP_REQUEST); + adata = &ahdr->arp_data; + ether_addr_copy(&ctx->mac, &adata->arp_sha); + adata->arp_sip = ctx->ipv4; + set_broadcast_addr(&adata->arp_tha); + adata->arp_tip = addr->s_addr; + + pad_len = ETHER_MIN_LEN - sizeof(*eth) - 
sizeof(*ahdr); + pad = rte_pktmbuf_append(m, pad_len); + for (i = 0; i < pad_len; ++i) + pad[i] = 0; + + while (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &m, 1) == 0); +} + +void +mac_check(struct glue_ctx *ctx, const struct sockaddr* addr) +{ + int rc; + const struct in_addr* addr4 = NULL; + struct in_addr gate4; + const struct in6_addr* addr6 = NULL; + struct in6_addr gate6; + + if(addr->sa_family == AF_INET) { + addr4 = ipv4_gateway_lookup(ctx, + &((const struct sockaddr_in *)addr)->sin_addr, &gate4); + rc = rte_hash_lookup(ctx->arp_hash, addr4); + } + else { + addr6 = ipv6_gateway_lookup(ctx, + &((const struct sockaddr_in6 *)addr)->sin6_addr, &gate6); + rc = rte_hash_lookup(ctx->arp6_hash, addr6); + } + if (rc >= 0) + return; + + if(addr->sa_family == AF_INET) { + arp_send_request(ctx, addr4); + //ipv4_dst_add(ctx, addr, NULL); + } else { + arp6_send_request(ctx, addr6); + //ipv6_dst_add(ctx, addr, NULL); + } +} + +static int +arp_inherit(struct glue_ctx *ctx, const struct in_addr *addr) +{ + struct glue_ctx *next = NULL; + uint64_t idx; + uint16_t i; + struct tle_dest *dst; + struct ether_hdr *eth; + int rc; + + for (i = 0; i < nb_ctx; i++) { + next = &ctx_array[i++]; + if (next == NULL || next == ctx) + continue; + + rc = rte_hash_lookup_data(next->arp_hash, addr, (void**)&idx); + if (rc < 0) + continue; + + dst = &next->arp4[idx].dst; + eth = (struct ether_hdr *)dst->hdr; + ipv4_dst_add(ctx, addr, ð->d_addr); + return 0; + } + + return -1; +} + +static int +arp6_inherit(struct glue_ctx *ctx, const struct in6_addr *addr) +{ + struct glue_ctx *next = NULL; + uint64_t idx; + uint16_t i; + struct tle_dest *dst; + struct ether_hdr *eth; + int rc; + + for (i = 0; i < nb_ctx; i++) { + next = &ctx_array[i++]; + if (next == NULL || next == ctx) + continue; + + rc = rte_hash_lookup_data(next->arp6_hash, addr, (void**)&idx); + if (rc < 0) + continue; + + dst = &next->arp6[idx].dst; + eth = (struct ether_hdr *)dst->hdr; + ipv6_dst_add(ctx, addr, ð->d_addr); + return 0; + } + + return -1; +} + +static int +arp_ipv6_dst_lookup(struct glue_ctx *ctx, const struct in6_addr *addr, + struct tle_dest *res, struct tle_dev *dev) +{ + int32_t rc; + uint64_t idx; + struct tle_dest *dst; + + if (is_ipv6_loopback_addr(addr, ctx)) { + dst = &ctx->lb_dst_v6; + rte_memcpy(res, dst, dst->l2_len + dst->l3_len + + offsetof(struct tle_dest, hdr)); + res->dev = dev; + return 0; + } + +retry: + rc = rte_hash_lookup_data(ctx->arp6_hash, addr, (void**)&idx); + if (rc >= 0) { + if (!ctx->arp6[idx].inuse) + ctx->arp6[idx].inuse = 1; + dst = &ctx->arp6[idx].dst; + rte_memcpy(res, dst, dst->l2_len + dst->l3_len + + offsetof(struct tle_dest, hdr)); + res->dev = dev; + } else { + if (arp6_inherit(ctx, addr) < 0) + ipv6_dst_add(ctx, addr, NULL); + goto retry; + } + + return rc; +} + +static int +arp_ipv4_dst_lookup(struct glue_ctx *ctx, const struct in_addr *addr, + struct tle_dest *res, struct tle_dev *dev) +{ + int32_t rc; + uint64_t idx; + struct tle_dest *dst; + + if (is_ipv4_loopback_addr(addr->s_addr, ctx)) { + dst = &ctx->lb_dst; + rte_memcpy(res, dst, dst->l2_len + dst->l3_len + + offsetof(struct tle_dest, hdr)); + res->dev = dev; + return 0; + } + +retry: + rc = rte_hash_lookup_data(ctx->arp_hash, addr, (void**)&idx); + if (rc >= 0) { + if (!ctx->arp4[idx].inuse) + ctx->arp4[idx].inuse = 1; + dst = &ctx->arp4[idx].dst; + rte_memcpy(res, dst, dst->l2_len + dst->l3_len + + offsetof(struct tle_dest, hdr)); + res->dev = dev; + } else { + if (arp_inherit(ctx, addr) < 0) + ipv4_dst_add(ctx, addr, NULL); + goto retry; + 
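+		/* ipv4_dst_add() with a NULL mac installs a placeholder
+		 * entry with a broadcast d_addr, so the retry above always
+		 * finds an entry; packets hitting it are held on
+		 * ctx->arp_wait (see mac_fill()) until the reply arrives.
+		 */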
} + + return rc; +} + +int +arp_ipv4_dst_lookup_tcp(void *data, const struct in_addr *addr, + struct tle_dest *res) +{ + struct glue_ctx *ctx = data; + + if (is_ipv4_loopback_addr(addr->s_addr, ctx)) + return arp_ipv4_dst_lookup(ctx, addr, res, ctx->lb_tcp_dev); + else + return arp_ipv4_dst_lookup(ctx, addr, res, ctx->tcp_dev); +} + +int +arp_ipv6_dst_lookup_tcp(void *data, const struct in6_addr *addr, + struct tle_dest *res) +{ + struct glue_ctx *ctx = data; + + if (is_ipv6_loopback_addr(addr, ctx)) + return arp_ipv6_dst_lookup(ctx, addr, res, ctx->lb_tcp_dev); + else + return arp_ipv6_dst_lookup(ctx, addr, res, ctx->tcp_dev); +} + +int +arp_ipv4_dst_lookup_udp(void *data, const struct in_addr *addr, + struct tle_dest *res) +{ + int rc; + struct glue_ctx *ctx = data; + struct ipv4_hdr *ip4h; + + if (is_ipv4_loopback_addr(addr->s_addr, ctx)) + rc = arp_ipv4_dst_lookup(ctx, addr, res, ctx->lb_udp_dev); + else + rc = arp_ipv4_dst_lookup(ctx, addr, res, ctx->udp_dev); + + if (rc >= 0) { + /* fix next_proto_id */ + ip4h = (struct ipv4_hdr *)&res->hdr[res->l2_len]; + ip4h->next_proto_id = IPPROTO_UDP; + } + return rc; +} + +int +arp_ipv6_dst_lookup_udp(void *data, const struct in6_addr *addr, + struct tle_dest *res) +{ + int rc; + struct glue_ctx *ctx = data; + struct ipv6_hdr *ip6h; + + if (is_ipv6_loopback_addr(addr, ctx)) + rc = arp_ipv6_dst_lookup(ctx, addr, res, ctx->lb_udp_dev); + else + rc = arp_ipv6_dst_lookup(ctx, addr, res, ctx->udp_dev); + + if (rc >= 0) { + /* fix next_proto_id */ + ip6h = (struct ipv6_hdr *)&res->hdr[res->l2_len]; + ip6h->proto = IPPROTO_UDP; + } + return rc; +} + +int +mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m) +{ + int32_t rc; + uint64_t idx; + struct arp_entry* entry; + struct ether_addr *dst, *dst1; + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + uint8_t ipver; + const struct in_addr* addr4 = NULL; + struct in_addr gate4; + const struct in6_addr* addr6 = NULL; + struct in6_addr gate6; + + dst = rte_pktmbuf_mtod(m, struct ether_addr *); + if (!is_broadcast_ether_addr(dst)) + return 0; + + ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len); + ipv6_hdr = NULL; + ipver = ipv4_hdr->version_ihl >> 4; + if (ipver == 4) { + addr4 = ipv4_gateway_lookup(ctx, + (const struct in_addr *)&ipv4_hdr->dst_addr, &gate4); + rc = rte_hash_lookup_data(ctx->arp_hash, addr4, (void**)&idx); + if (rc >= 0) + entry = &ctx->arp4[idx]; + } else { + ipv6_hdr = (struct ipv6_hdr*)ipv4_hdr; + addr6 = ipv6_gateway_lookup(ctx, + (const struct in6_addr *)ipv6_hdr->dst_addr, &gate6); + rc = rte_hash_lookup_data(ctx->arp6_hash, addr6, (void**)&idx); + if (rc >= 0) + entry = &ctx->arp6[idx]; + } + + if (rc >= 0) { + dst1 = (struct ether_addr *)entry->dst.hdr; + if (!is_broadcast_ether_addr(dst1)) { + ether_addr_copy(dst1 , dst); + return 0; + } + + if (ipver == 4) + arp_send_request(ctx, addr4); + else + arp6_send_request(ctx, addr6); + entry->req_time++; + if (entry->timer != NULL) { + tle_timer_stop(ctx->arp_tmw, entry->timer); + } + entry->timer = tle_timer_start(ctx->arp_tmw, entry, ARP_REQUEST_EXPIRE); + } + + return -1; +} + +static inline const struct in_addr * +get_addr_from_entry(struct arp_entry *e) +{ + const struct ipv4_hdr *ipv4; + const struct in_addr *addr; + + ipv4 = (struct ipv4_hdr *)(e->dst.hdr + e->dst.l2_len); + addr = (const struct in_addr *)&ipv4->dst_addr; + + return addr; +} + +static inline const struct in6_addr * +get_addr6_from_entry(struct arp_entry *e) +{ + const struct ipv6_hdr *ipv6; + const struct in6_addr *addr; + + ipv6 = 
(struct ipv6_hdr *)(e->dst.hdr + e->dst.l2_len); + addr = (const struct in6_addr *)ipv6->dst_addr; + + return addr; +} + +static inline void +arp6_entry_del(struct glue_ctx *ctx, struct arp_entry *e) +{ + const struct in6_addr *addr; + struct ether_addr *eth_addr; + struct rte_mbuf *pkt, *pre; + uint32_t idx, last_idx; + struct ipv6_hdr *ip6h; + + idx = e - ctx->arp6; + last_idx = ctx->arp6_num - 1; + if (idx > last_idx) /* entry has been moved, don't timeout this time */ + return; + + addr = get_addr6_from_entry(e); + eth_addr = (struct ether_addr*)e->dst.hdr; + + print_arp6_entry(addr, eth_addr, "DELETE"); + if (e->req_time > ARP_MAX_REQ_TIMES) { + /* free pkts waiting for the ARP response */ + pkt = ctx->arp_wait; + for (pre = NULL; pkt != NULL; pkt = pkt->next_pkt) { + ip6h = rte_pktmbuf_mtod_offset(pkt, struct ipv6_hdr *, + pkt->l2_len); + if (memcmp(addr, ip6h->dst_addr, sizeof(struct in6_addr)) != 0) { + pre = pkt; + continue; + } + + if (pre == NULL) + ctx->arp_wait = pkt->next_pkt; + else + pre->next_pkt = pkt->next_pkt; + + rte_pktmbuf_free(pkt); + } + } + + rte_hash_del_key(ctx->arp6_hash, addr); + + /* if it's not the last entry, use last entry to replace current entry */ + if (idx < last_idx) { + rte_memcpy(e, ctx->arp6 + last_idx, sizeof(*e)); + rte_hash_add_key_data(ctx->arp6_hash, addr, (void*)(uintptr_t)idx); + tle_timer_stop(ctx->arp_tmw, ctx->arp6[last_idx].timer); + if (e->req_time > 0) { + e->timer = tle_timer_start(ctx->arp_tmw, e, ARP_REQUEST_EXPIRE); + } else { + e->timer = tle_timer_start(ctx->arp_tmw, e, ARP_ENTRY_EXPIRE); + e->inuse = 0; + } + } + + /* we always delete the last entry to keep it contiguous */ + ctx->arp6[last_idx].timer = NULL; + ctx->arp6[last_idx].inuse = 0; + ctx->arp6[last_idx].req_time = 0; + ctx->arp6_num--; +} + +static inline void +arp_entry_del(struct glue_ctx *ctx, struct arp_entry *e) +{ + const struct in_addr *addr; + struct ether_addr *eth_addr; + struct rte_mbuf *pkt, *pre; + uint32_t idx, last_idx; + struct ipv4_hdr *ip4h; + + idx = e - ctx->arp4; + last_idx = ctx->arp4_num - 1; + if (idx > last_idx) /* entry has been moved, don't timeout this time */ + return; + + addr = get_addr_from_entry(e); + eth_addr = (struct ether_addr*)e->dst.hdr; + + print_arp_entry(addr, eth_addr, "DELETE"); + if (e->req_time > ARP_MAX_REQ_TIMES) { + /* free pkts waiting for the ARP response */ + pkt = ctx->arp_wait; + for (pre = NULL; pkt != NULL; pkt = pkt->next_pkt) { + ip4h = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, + pkt->l2_len); + if (addr->s_addr != ip4h->dst_addr) { + pre = pkt; + continue; + } + + if (pre == NULL) + ctx->arp_wait = pkt->next_pkt; + else + pre->next_pkt = pkt->next_pkt; + + rte_pktmbuf_free(pkt); + } + } + + rte_hash_del_key(ctx->arp_hash, addr); + + /* if it's not the last entry, use last entry to replace current entry */ + if (idx < last_idx) { + rte_memcpy(e, ctx->arp4 + last_idx, sizeof(*e)); + rte_hash_add_key_data(ctx->arp_hash, addr, (void*)(uintptr_t)idx); + tle_timer_stop(ctx->arp_tmw, ctx->arp4[last_idx].timer); + if (e->req_time > 0) { + e->timer = tle_timer_start(ctx->arp_tmw, e, ARP_REQUEST_EXPIRE); + } else { + e->timer = tle_timer_start(ctx->arp_tmw, e, ARP_ENTRY_EXPIRE); + e->inuse = 0; + } + } + + /* we always delete the last entry to keep it contiguous */ + ctx->arp4[last_idx].timer = NULL; + ctx->arp4[last_idx].inuse = 0; + ctx->arp4[last_idx].req_time = 0; + ctx->arp4_num--; +} + +void +mac_timeout(struct glue_ctx *ctx) +{ +#define ARP_PROCESS_MAX 32 + struct arp_entry *entry[ARP_PROCESS_MAX], *e; + 
struct tle_timer_wheel *tw; + uint32_t i, cnt; + uint8_t *l3h; + + tw = ctx->arp_tmw; + tle_timer_expire(tw, rte_get_tsc_cycles() >> ctx->cycles_ms_shift); + cnt = tle_timer_get_expired_bulk(tw, (void**)entry, ARP_PROCESS_MAX); + if (cnt == 0) + return; + + rte_spinlock_lock(&ctx->arp_lock); + for(i = 0; i < cnt; i++) { + e = entry[i]; + e->timer = NULL; + l3h = e->dst.hdr + e->dst.l2_len; + if (e->inuse || + (e->req_time > 0 && e->req_time <= ARP_MAX_REQ_TIMES)) + { + if (((struct ipv4_hdr*)l3h)->version_ihl >> 4 == 4) + arp_send_request(ctx, (struct in_addr*) + (&((struct ipv4_hdr*)l3h)->dst_addr)); + else + arp6_send_request(ctx, (struct in6_addr*) + (((struct ipv6_hdr*)l3h)->dst_addr)); + + e->timer = tle_timer_start(ctx->arp_tmw, e, + ARP_REQUEST_EXPIRE); + e->inuse = 0; + e->req_time++; + } else { + if (((struct ipv4_hdr*)l3h)->version_ihl >> 4 == 4) + arp_entry_del(ctx, e); + else + arp6_entry_del(ctx, e); + } + } + rte_spinlock_unlock(&ctx->arp_lock); +} diff --git a/lib/libtle_glue/be.c b/lib/libtle_glue/be.c new file mode 100644 index 0000000..5170b55 --- /dev/null +++ b/lib/libtle_glue/be.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include "config.h" +#include "log.h" +#include "util.h" +#include "internal.h" + +static inline void +rte_pktmbuf_copy_seg(struct rte_mbuf *dst, struct rte_mbuf* src) +{ + size_t offset = offsetof(struct rte_mbuf, data_off); + rte_memcpy((char*)dst + offset, (char*)src + offset, + sizeof(struct rte_mbuf) - offset); + rte_mbuf_refcnt_set(dst, 1); + dst->ol_flags &= ~IND_ATTACHED_MBUF; + rte_memcpy(rte_pktmbuf_mtod(dst, void*), rte_pktmbuf_mtod(src, void*), + src->data_len); +} + +static inline struct rte_mbuf* +rte_pktmbuf_copy(struct rte_mbuf *md, struct rte_mempool* mp) +{ + struct rte_mbuf *mc, *mi, **prev; + uint32_t pktlen; + uint16_t nseg; + + if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) + return NULL; + + mi = mc; + prev = &mi->next; + pktlen = md->pkt_len; + nseg = 0; + + do { + nseg++; + rte_pktmbuf_copy_seg(mi, md); + *prev = mi; + prev = &mi->next; + } while ((md = md->next) != NULL && + (mi = rte_pktmbuf_alloc(mp)) != NULL); + + *prev = NULL; + mc->nb_segs = nseg; + mc->pkt_len = pktlen; + + /* Allocation of new indirect segment failed */ + if (unlikely(mi == NULL)) { + rte_pktmbuf_free(mc); + return NULL; + } + + __rte_mbuf_sanity_check(mc, 1); + return mc; +} + +static inline int +process_rx_pkts(struct glue_ctx *ctx, struct rte_mbuf *pkts[], uint32_t n, uint8_t from_loopback) +{ + uint32_t i, j, k, jt, ju, jd; + struct rte_mbuf *tcp[MAX_PKTS_BURST]; + struct rte_mbuf *udp[MAX_PKTS_BURST]; + struct rte_mbuf *drop[MAX_PKTS_BURST]; + int32_t rc[MAX_PKTS_BURST]; + struct tle_dev *tcp_dev, *udp_dev; + struct rte_mempool *mp; + struct rte_mbuf *tmp; + uint64_t ts; + + if (n == 0) + return 0; + + if (unlikely(from_loopback)) { + tcp_dev = ctx->lb_tcp_dev; + udp_dev = ctx->lb_udp_dev; + mp = 
pkts[0]->pool;
+		for (i = 0; i < n; i++) {
+			tmp = rte_pktmbuf_copy(pkts[i], mp);
+			if (tmp != NULL) {
+				rte_pktmbuf_free(pkts[i]);
+				pkts[i] = tmp;
+				pkts[i]->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
+				pkts[i]->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+			} else {
+				k = i;
+				for (; i < n; i++) {
+					rte_pktmbuf_free(pkts[i]);
+				}
+				n = k;
+			}
+		}
+	} else {
+		tcp_dev = ctx->tcp_dev;
+		udp_dev = ctx->udp_dev;
+	}
+
+	ts = rte_get_tsc_cycles() >> (ctx->cycles_ms_shift - 10);
+
+	for (j = 0, jt = 0, ju = 0, jd = 0; j < n; j++) {
+		pkts[j]->timestamp = ts;
+		switch (pkts[j]->packet_type & RTE_PTYPE_L4_MASK) {
+		case RTE_PTYPE_L4_TCP:
+			tcp[jt++] = pkts[j];
+			break;
+		case RTE_PTYPE_L4_UDP:
+			udp[ju++] = pkts[j];
+			break;
+		case RTE_PTYPE_L4_ICMP:
+			/* TODO */
+		case RTE_PTYPE_L4_FRAG:
+			/* TODO */
+		default:
+			drop[jd++] = pkts[j];
+		}
+	}
+
+	if (jt > 0) {
+		k = tle_tcp_rx_bulk(tcp_dev, tcp, drop + jd, rc, jt);
+		jd += jt - k;
+
+		TRACE("(port=%u, queue=%u), %u/%u (TCP) pkts are received",
+		      ctx->port_id, ctx->queue_id, k, n);
+	}
+
+	if (ju > 0) {
+		k = tle_udp_rx_bulk(udp_dev, udp, drop + jd, rc, ju);
+		jd += ju - k;
+
+		TRACE("(port=%u, queue=%u), %u/%u (UDP) pkts are received",
+		      ctx->port_id, ctx->queue_id, k, n);
+	}
+
+	for (j = 0; j < jd; j++)
+		rte_pktmbuf_free(drop[j]);
+
+	return jt + ju - jd;
+}
+
+static inline int
+be_rx(struct glue_ctx *ctx)
+{
+	int ret;
+	uint32_t n;
+	struct rte_mbuf *pkts[MAX_PKTS_BURST];
+	uint16_t port_id = ctx->port_id;
+	uint16_t queue_id = ctx->queue_id;
+
+	n = rte_eth_rx_burst(port_id, queue_id, pkts, RTE_DIM(pkts));
+	ret = process_rx_pkts(ctx, pkts, n, 0);
+
+	return ret;
+}
+
+int
+be_tx(struct glue_ctx *ctx)
+{
+	uint32_t n, j, k, s, ret;
+	const uint16_t max_pkts = MAX_PKTS_BURST;
+	struct rte_mbuf *pkts[max_pkts];
+	struct rte_mbuf *_pkts[max_pkts];
+	uint16_t port_id = ctx->port_id;
+	uint16_t queue_id = ctx->queue_id;
+
+	ret = 0;
+	tle_tcp_process(ctx->tcp_ctx, TCP_MAX_PROCESS);
+
+	n = tle_tcp_tx_bulk(ctx->lb_tcp_dev, pkts, max_pkts);
+	n += tle_udp_tx_bulk(ctx->lb_udp_dev, pkts + n, max_pkts - n);
+	if (n > 0) {
+		ret += n;
+		rte_eth_tx_burst(ctx->lb_port_id, 0, pkts, n);
+		/* the loopback device can receive right after transmit */
+		n = rte_eth_rx_burst(ctx->lb_port_id, 0, pkts, RTE_DIM(pkts));
+		process_rx_pkts(ctx, pkts, n, 1);
+
+		/* wake up look-aside backend */
+		wake_lookaside_backend(ctx);
+	}
+
+	n = tle_tcp_tx_bulk(ctx->tcp_dev, pkts, max_pkts);
+	n += tle_udp_tx_bulk(ctx->udp_dev, pkts + n, max_pkts - n);
+	if (n == 0)
+		return 0;
+
+	ret += n;
+	s = 0;
+	for (j = 0; j != n; j++) {
+		if (mac_fill(ctx, pkts[j]) == 0) {
+			PKT_DUMP(pkts[j]);
+			_pkts[s++] = pkts[j];
+			continue;
+		}
+
+		pkts[j]->next_pkt = ctx->arp_wait;
+		ctx->arp_wait = pkts[j];
+	}
+
+	/* For the virtio-user/vhost-kernel test case, it is normal that the
+	 * vhost kthread cannot catch up with the packet generation speed of
+	 * the stack. Shall we drop those packets immediately, or retry for a
+	 * while to keep them? We found that dropping packets here is not a
+	 * good idea: it leads to lots of retransmissions and an inefficient
+	 * vhost kthread. Even the code below does not work well:
+	 *
+	 * for (k = 0, retry = 0; k < s && retry < 10000; retry++)
+	 *	k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+	 *
+	 * So we choose to send out the packets in a blocking manner.
+	 */
+	k = 0;
+	while (k < s)
+		k += rte_eth_tx_burst(port_id, queue_id, _pkts + k, s - k);
+
+	for (j = k; j != s; j++)
+		rte_pktmbuf_free(_pkts[j]);
+
+	TRACE("(port=%u, queue=%u), %u/%u pkts are sent",
+	      port_id, queue_id, k, s);
+
+	return ret;
+}
+
+int
+be_process(struct glue_ctx *ctx)
+{
+	int ret;
+
+	if (unlikely(stopped))
+		return 0;
+
+	ret = be_rx(ctx);
+	mac_timeout(ctx);
+	ret += be_tx(ctx);
+
+	return ret;
+}
diff --git a/lib/libtle_glue/config.h b/lib/libtle_glue/config.h
new file mode 100644
index 0000000..6112380
--- /dev/null
+++ b/lib/libtle_glue/config.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _TLE_GLUE_CONFIG_H_
+#define _TLE_GLUE_CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_STREAMS_PER_CORE (64 * 1024)
+#define MIN_STREAMS_PER_CORE 16
+#define DELTA_STREAMS 64
+#define FRAG_BUCKET 8
+#define FRAG_ENTRIES_PER_BUCKET 8
+#define MAX_ARP_ENTRY (1 << 10)
+
+/* RCV buffer & SND buffer
+ * This is not a real rcv/snd buffer implementation. The numbers below
+ * give the slots used to store mbufs of sent or received data. Each slot
+ * can contain a single mbuf with a size of (1500B or 2048B), or a chained
+ * mbuf with a size <= 64KB.
+ *
+ * TODO: add a real snd/rcv buffer
+ */
+#define MAX_RECV_BUFS_PER_STREAM 256
+#define MAX_SEND_BUFS_PER_STREAM 256
+
+#ifdef LOOK_ASIDE_BACKEND
+#define MAX_NB_CTX 1
+#else
+#define MAX_NB_CTX 16
+#endif
+
+#define MAX_MBUFS 0x80000
+/* should be calculated as:
+ * MAX_NB_CTX * MAX_STREAMS_PER_CORE *
+ *	(MAX_RECV_BUFS_PER_STREAM + MAX_SEND_BUFS_PER_STREAM)
+ */
+
+#define MBUF_DYNAMIC_SIZE 0x800
+
+#define MBUF_PERCORE_CACHE 64
+
+#define MAX_PKTS_BURST 0x20
+
+#define TCP_MAX_PROCESS 32
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TLE_GLUE_CONFIG_H_ */
diff --git a/lib/libtle_glue/ctx.c b/lib/libtle_glue/ctx.c
new file mode 100644
index 0000000..a7f4c0a
--- /dev/null
+++ b/lib/libtle_glue/ctx.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <limits.h>
+#include <stdlib.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_hash.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_ip_frag.h>
+
+#include "config.h"
+#include "ctx.h"
+#include "log.h"
+#include "util.h"
+#include "internal.h"
+#include "gateway.h"
+#include "tle_timer.h"
+
+RTE_DEFINE_PER_LCORE(struct glue_ctx *, glue_ctx);
+
+int nb_ctx;
+struct glue_ctx ctx_array[MAX_NB_CTX];
+struct glue_ctx *default_ctx = &ctx_array[0];
+
+static int ipv4_dst_lookup_tcp(void *data,
+	const struct in_addr *addr, struct tle_dest *res)
+{
+	struct in_addr gate;
+	addr = ipv4_gateway_lookup(data, addr, &gate);
+	return arp_ipv4_dst_lookup_tcp(data, addr, res);
+}
+
+static int ipv4_dst_lookup_udp(void *data,
+	const struct in_addr *addr, struct tle_dest *res)
+{
+	struct in_addr gate;
+	addr = ipv4_gateway_lookup(data, addr, &gate);
+	return arp_ipv4_dst_lookup_udp(data, addr, res);
+}
+
+static int ipv6_dst_lookup_tcp(void *data,
+	const struct in6_addr *addr, struct tle_dest *res)
+{
+	struct in6_addr gate;
+	addr = ipv6_gateway_lookup(data, addr, &gate);
+	return arp_ipv6_dst_lookup_tcp(data, addr, res);
+}
+
+static int ipv6_dst_lookup_udp(void *data,
+	const struct in6_addr *addr, struct tle_dest *res)
+{
+	struct in6_addr gate;
+	addr = ipv6_gateway_lookup(data, addr, &gate);
+	return arp_ipv6_dst_lookup_udp(data, addr, res);
+}
+
+static struct tle_ctx *proto_ctx_create(uint32_t socket_id, uint32_t proto, void *data)
+{
+	struct tle_ctx_param cprm;
+
+	if (proto != TLE_PROTO_TCP && proto != TLE_PROTO_UDP)
+		rte_panic("Invalid proto [%u]\n", proto);
+
+	cprm.socket_id = socket_id;
+	cprm.proto = proto;
+	cprm.max_streams = MAX_STREAMS_PER_CORE;
+	cprm.min_streams = MIN_STREAMS_PER_CORE;
+	cprm.delta_streams = DELTA_STREAMS;
+	cprm.max_stream_rbufs = MAX_RECV_BUFS_PER_STREAM;
+	cprm.max_stream_sbufs = MAX_SEND_BUFS_PER_STREAM;
+	if (proto == TLE_PROTO_TCP) {
+		cprm.lookup4 = ipv4_dst_lookup_tcp;
+		cprm.lookup6 = ipv6_dst_lookup_tcp;
+	} else {
+		cprm.lookup4 = ipv4_dst_lookup_udp;
+		cprm.lookup6 = ipv6_dst_lookup_udp;
+	}
+	cprm.lookup4_data = data;
+	cprm.lookup6_data = data;
+#ifdef LOOK_ASIDE_BACKEND
+	cprm.flags = 0;
+#else
+	cprm.flags = TLE_CTX_FLAG_ST; /* ctx will be used by a single thread */
+#endif
+	cprm.send_bulk_size = 0; /* 32 if 0 */
+	cprm.hash_alg = TLE_SIPHASH;
+	cprm.secret_key.u64[0] = rte_rand();
+	cprm.secret_key.u64[1] = rte_rand();
+	cprm.icw = 0; /**< congestion window, default is 2*MSS if 0.
*/ + cprm.timewait = 1; /* TLE_TCP_TIMEWAIT_DEFAULT */ + + return tle_ctx_create(&cprm); +} + +static int evq_init(struct glue_ctx *ctx, uint32_t socket_id) +{ + struct tle_evq_param eprm; + + eprm.socket_id = socket_id; + eprm.max_events = MAX_STREAMS_PER_CORE; + ctx->syneq = tle_evq_create(&eprm); + if (ctx->syneq == NULL) + rte_panic("Cannot create syneq"); + + ctx->ereq = tle_evq_create(&eprm); + if (ctx->ereq == NULL) + rte_panic("Cannot create ereq"); + + ctx->rxeq = tle_evq_create(&eprm); + if (ctx->rxeq == NULL) + rte_panic("Cannot create rxeq"); + + ctx->txeq = tle_evq_create(&eprm); + if (ctx->txeq == NULL) + rte_panic("Cannot create txeq"); + + return 0; +} + +static void tle_ctx_init(struct glue_ctx *ctx, uint32_t socket_id) +{ + struct tle_dev_param dprm; + struct rte_eth_dev_info dev_info; + uint16_t port_id = 0; /* currently only use one port */ + + ctx->tcp_ctx = proto_ctx_create(socket_id, TLE_PROTO_TCP, ctx); + if (ctx->tcp_ctx == NULL) + rte_panic("Cannot create tle_ctx for tcp"); + + ctx->udp_ctx = proto_ctx_create(socket_id, TLE_PROTO_UDP, ctx); + if (ctx->udp_ctx == NULL) + rte_panic("Cannot create tle_ctx for udp"); + + memset(&dprm, 0, sizeof(dprm)); + + /* offloading check and set */ + rte_eth_dev_info_get(port_id, &dev_info); + dprm.rx_offload = dev_info.rx_offload_capa & rx_offload; + dprm.tx_offload = dev_info.tx_offload_capa & tx_offload; + + dprm.local_addr4.s_addr = ctx->ipv4; + rte_memcpy(&dprm.local_addr6, &ctx->ipv6, sizeof(struct in6_addr)); + dprm.bl4.nb_port = 0; + dprm.bl4.port = NULL; + dprm.bl6.nb_port = 0; + dprm.bl6.port = NULL; + + ctx->tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm); + if (ctx->tcp_dev == NULL) + rte_panic("add tle_dev for tcp failed: %u", rte_errno); + + ctx->udp_dev = tle_add_dev(ctx->udp_ctx, &dprm); + if (ctx->udp_dev == NULL) + rte_panic("add tle_dev for udp failed: %u", rte_errno); + + if (ctx == default_ctx) { + dprm.rx_offload = rx_offload; + dprm.tx_offload = tx_offload; + dprm.local_addr4.s_addr = htonl(INADDR_LOOPBACK); + rte_memcpy(&dprm.local_addr6, &in6addr_loopback, + sizeof(struct in6_addr)); + + ctx->lb_tcp_dev = tle_add_dev(ctx->tcp_ctx, &dprm); + if (ctx->lb_tcp_dev == NULL) + rte_panic("failed to add loopback tcp dev: %u\n", + rte_errno); + + ctx->lb_udp_dev = tle_add_dev(ctx->udp_ctx, &dprm); + if (ctx->lb_udp_dev == NULL) + rte_panic("failed to add loopback udp dev: %u\n", + rte_errno); + } + + evq_init(ctx, socket_id); +} + +static uint32_t +get_ip(void) +{ + struct in_addr addr; + const char *ip_str = getenv(DPDK_IP); + + if (ip_str == NULL) { + ip_str = DPDK_IP_DEF; + GLUE_LOG(INFO, "will use the default IP %s", DPDK_IP_DEF); + } else + GLUE_LOG(INFO, "will use the IP %s", ip_str); + + if (inet_aton(ip_str, &addr) == 0) + rte_panic("Invalid addr from env DPDK_IP: %s", ip_str); + + return addr.s_addr; +} + +static uint8_t +get_ip_mask(void) +{ + const char *mask_str = getenv(DPDK_IP_MASK); + + if (mask_str == NULL) { + mask_str = DPDK_IP_MASK_DEF; + GLUE_LOG(INFO, "will use the default IP Mask %s", DPDK_IP_MASK_DEF); + } else + GLUE_LOG(INFO, "will use the IP Mask %s", mask_str); + + return (uint8_t)atoi(mask_str); +} + +static uint32_t +get_ip_gate(void) +{ + struct in_addr addr; + const char *ip_str = getenv(DPDK_IP_GATEWAY); + + if (ip_str == NULL) { + ip_str = DPDK_IP_GATEWAY_DEF; + GLUE_LOG(INFO, "will use the default IP gateway %s", DPDK_IP_GATEWAY_DEF); + } else + GLUE_LOG(INFO, "will use the IP gateway %s", ip_str); + + if (inet_aton(ip_str, &addr) == 0) + rte_panic("Invalid addr from env 
DPDK_IP_GATEWAY: %s", ip_str); + + return addr.s_addr; +} + +static struct in6_addr* +get_ipv6(void) +{ + static struct in6_addr addr; + const char *ip_str = getenv(DPDK_IPV6); + + if (ip_str == NULL) { + ip_str = DPDK_IPV6_DEF; + GLUE_LOG(INFO, "will use the default IP(V6) %s", DPDK_IPV6_DEF); + } else + GLUE_LOG(INFO, "will use the IP(V6) %s", ip_str); + + if (inet_pton(AF_INET6, ip_str, &addr) == 0) + rte_panic("Invalid addr from env DPDK_IPV6: %s", ip_str); + + return &addr; +} + +static uint8_t +get_ipv6_mask(void) +{ + const char *mask_str = getenv(DPDK_IPV6_MASK); + + if (mask_str == NULL) { + mask_str = DPDK_IPV6_MASK_DEF; + GLUE_LOG(INFO, "will use the default IPV6 Mask %s", DPDK_IPV6_MASK_DEF); + } else + GLUE_LOG(INFO, "will use the IPV6 Mask %s", mask_str); + + return (uint8_t)atoi(mask_str); +} + +static struct in6_addr* +get_ipv6_gate(void) +{ + static struct in6_addr addr; + const char *ip_str = getenv(DPDK_IPV6_GATEWAY); + + if (ip_str == NULL) { + ip_str = DPDK_IPV6_GATEWAY_DEF; + GLUE_LOG(INFO, "will use the default IP(V6) gateway %s", DPDK_IPV6_GATEWAY_DEF); + } else + GLUE_LOG(INFO, "will use the IP(V6) gateway %s", ip_str); + + if (inet_pton(AF_INET6, ip_str, &addr) == 0) + rte_panic("Invalid addr from env DPDK_IPV6_GATEWAY: %s", ip_str); + + return &addr; +} + +static void +loopback_dst_init(struct glue_ctx *ctx) +{ + struct tle_dest *dst; + struct ether_hdr *eth; + struct ipv4_hdr *ip4h; + struct ipv6_hdr *ip6h; + + /* init ipv4 dst */ + dst = &ctx->lb_dst; + dst->mtu = 65535; + + dst->l2_len = sizeof(*eth); + dst->head_mp = get_mempool_by_socket(0); /* fix me */ + eth = (struct ether_hdr *)dst->hdr; + memset(eth, 0, 2 * sizeof(eth->d_addr)); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + + dst->l3_len = sizeof(*ip4h); + ip4h = (struct ipv4_hdr *)(eth + 1); + ip4h->dst_addr = htonl(INADDR_LOOPBACK); /* fixme: loopback is not only for 127.0.0.1 */ + ip4h->version_ihl = 4 << 4 | sizeof(*ip4h) / IPV4_IHL_MULTIPLIER; + ip4h->time_to_live = 64; + ip4h->next_proto_id = IPPROTO_TCP; + + /* init ipv6 dst */ + dst = &ctx->lb_dst_v6; + dst->mtu = 65535; + + dst->l2_len = sizeof(*eth); + dst->head_mp = get_mempool_by_socket(0); /* fix me */ + eth = (struct ether_hdr *)dst->hdr; + memset(eth, 0, 2 * sizeof(eth->d_addr)); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv6); + + dst->l3_len = sizeof(*ip6h); + ip6h = (struct ipv6_hdr *)(eth + 1); + rte_memcpy(ip6h->dst_addr, &in6addr_loopback, sizeof(struct in6_addr)); + ip6h->vtc_flow = 6 << 4; + ip6h->hop_limits = 255; + ip6h->proto = IPPROTO_TCP; +} + +static void +arp_hash_init(struct glue_ctx *ctx, unsigned socket_id) +{ + char str[RTE_HASH_NAMESIZE]; + struct rte_hash_parameters hprm; + + /* init ipv4 arp hash */ + snprintf(str, sizeof(str), "arp_hash_4@ctx%u", ctx->queue_id); + memset(&hprm, 0, sizeof(hprm)); + hprm.name = str; + hprm.entries = MAX_ARP_ENTRY * 2; + hprm.socket_id = socket_id; + hprm.key_len = sizeof(struct in_addr); + ctx->arp_hash = rte_hash_create(&hprm); + if (ctx->arp_hash == NULL) { + rte_panic("Failed to init hashtable for ARP"); + } + + /* init ipv6 arp hash */ + snprintf(str, sizeof(str), "arp_hash_6@ctx%u", ctx->queue_id); + memset(&hprm, 0, sizeof(hprm)); + hprm.name = str; + hprm.entries = MAX_ARP_ENTRY * 2; + hprm.socket_id = socket_id; + hprm.key_len = sizeof(struct in6_addr); + ctx->arp6_hash = rte_hash_create(&hprm); + if (ctx->arp6_hash == NULL) { + rte_panic("Failed to init hashtable for ARP6"); + } +} + +static void +arp_timer_init(struct glue_ctx *ctx, unsigned socket_id) +{ + 
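+	/* One timer per ARP entry plus a little slack; the wheel is
+	 * driven by a millisecond clock, so tick_size = 1000 gives
+	 * one-second ticks (the expire values above are in ms).
+	 */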
struct tle_timer_wheel_args twprm;
+
+	twprm.tick_size = 1000U;
+	twprm.max_timer = MAX_ARP_ENTRY + 8;
+	twprm.socket_id = socket_id;
+	ctx->arp_tmw = tle_timer_create(&twprm, tcp_get_tms(ctx->cycles_ms_shift));
+	if (ctx->arp_tmw == NULL) {
+		rte_panic("Failed to init timer wheel for ARP");
+	}
+}
+
+static void
+glue_ctx_init(struct glue_ctx *ctx, uint32_t socket_id)
+{
+	uint64_t ms;
+
+	ctx->arp4 = rte_zmalloc_socket(NULL,
+		sizeof(struct arp_entry) * MAX_ARP_ENTRY,
+		RTE_CACHE_LINE_SIZE, socket_id);
+	ctx->arp6 = rte_zmalloc_socket(NULL,
+		sizeof(struct arp_entry) * MAX_ARP_ENTRY,
+		RTE_CACHE_LINE_SIZE, socket_id);
+	if (!ctx->arp4 || !ctx->arp6)
+		rte_panic("Failed to allocate arp table");
+
+	ctx->port_id = 0;
+	ctx->queue_id = nb_ctx - 1;
+	ctx->ipv4 = get_ip();
+	ctx->ipv4_ml = get_ip_mask();
+	ctx->ipv4_gw = get_ip_gate();
+	rte_memcpy(&ctx->ipv6, get_ipv6(), sizeof(struct in6_addr));
+	ctx->ipv6_ml = get_ipv6_mask();
+	rte_memcpy(&ctx->ipv6_gw, get_ipv6_gate(), sizeof(struct in6_addr));
+
+	/* calculate the closest shift to convert from cycles to ms (approximate) */
+	ms = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S;
+	ctx->cycles_ms_shift = sizeof(ms) * CHAR_BIT - __builtin_clzll(ms) - 1;
+
+	rte_spinlock_init(&ctx->arp_lock);
+	rte_spinlock_init(&ctx->arp6_lock);
+	arp_hash_init(ctx, socket_id);
+	arp_timer_init(ctx, socket_id);
+	ctx->arp_wait = NULL;
+
+	ctx->frag_tbl = rte_ip_frag_table_create(FRAG_BUCKET,
+		FRAG_ENTRIES_PER_BUCKET, FRAG_BUCKET * FRAG_ENTRIES_PER_BUCKET,
+		rte_get_tsc_hz(), socket_id);
+	if (ctx->frag_tbl == NULL)
+		rte_panic("Failed to create ip defrag table");
+
+	PERCPU_MIB = &ctx->mib;
+}
+
+static int ctx_seq;
+static rte_spinlock_t ctx_lock = RTE_SPINLOCK_INITIALIZER;
+
+uint8_t
+glue_ctx_alloc(void)
+{
+	uint32_t socket_id;
+	struct glue_ctx *ctx;
+
+	/* fixme: we need a finer-grained lock */
+	rte_spinlock_lock(&ctx_lock);
+
+	GLUE_LOG(INFO, "allocate ctx: %d", ctx_seq);
+	if (ctx_seq == 0) {
+		/* Called from constructor init() */
+		ctx_seq = 1;
+	} else if (ctx_seq == 1) {
+		/* Called from first epoll_create() or poll() */
+		ctx_seq = 2;
+		ctx = default_ctx;
+		goto unlock;
+	}
+
+	if (nb_ctx >= MAX_NB_CTX)
+		rte_panic("Exceed the max number of ctx");
+
+	ctx = &ctx_array[nb_ctx++];
+	GLUE_LOG(INFO, "%u ctx allocated, and will init", nb_ctx);
+
+	socket_id = get_socket_id();
+
+	glue_ctx_init(ctx, socket_id);
+
+	/* reconfigure the "physical" port whenever # of ctx changes */
+	port_reconfig();
+
+	if (ctx == default_ctx) {
+		loopback_dst_init(ctx);
+
+		ctx->lb_port_id = create_loopback(socket_id);
+		GLUE_LOG(INFO, "loopback port_id: %u", ctx->lb_port_id);
+	}
+
+	rte_eth_macaddr_get(ctx->port_id, &ctx->mac);
+
+	tle_ctx_init(ctx, socket_id);
+
+unlock:
+	rte_spinlock_unlock(&ctx_lock);
+	return ctx - ctx_array;
+}
+
+void
+glue_ctx_free(struct glue_ctx *ctx __rte_unused)
+{
+	if (nb_ctx == 1 && ctx_seq == 2) {
+		GLUE_LOG(INFO, "free ctx");
+		ctx_seq = 1;
+		return;
+	}
+
+	rte_panic("closing an epoll fd while running is not supported\n");
+}
+
+struct glue_ctx *
+glue_ctx_lookup(uint16_t port_id, uint16_t queue_id)
+{
+	int i;
+
+	if (port_id == 1) /* loopback */
+		return default_ctx;
+
+	for (i = 0; i < nb_ctx; i++) {
+		if (ctx_array[i].port_id == port_id &&
+		    ctx_array[i].queue_id == queue_id)
+			return &ctx_array[i];
+	}
+
+	return NULL;
+}
diff --git a/lib/libtle_glue/ctx.h b/lib/libtle_glue/ctx.h
new file mode 100644
index 0000000..2197c8a
--- /dev/null
+++ b/lib/libtle_glue/ctx.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_SOCK_H_ +#define _TLE_GLUE_SOCK_H_ + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct arp_entry { + struct tle_dest dst; + uint8_t inuse; + uint8_t req_time; + void* timer; +}; + +struct glue_ctx { + struct tle_ctx *tcp_ctx; + struct tle_dev *tcp_dev; + struct tle_dev *lb_tcp_dev; + struct tle_ctx *udp_ctx; + struct tle_dev *udp_dev; + struct tle_dev *lb_udp_dev; + + struct tle_evq *syneq; + struct tle_evq *ereq; + struct tle_evq *rxeq; + struct tle_evq *txeq; + + uint16_t port_id; + uint16_t queue_id; + uint16_t lb_port_id; + + struct { + uint8_t ipv4_ml; + uint8_t ipv6_ml; + }; + + struct ether_addr mac; + struct rte_mbuf *arp_wait; + struct tle_timer_wheel *arp_tmw; + uint32_t cycles_ms_shift; /* to convert from cycles to ms */ + + struct { + uint32_t ipv4; + uint32_t ipv4_gw; + + uint32_t arp4_num; + rte_spinlock_t arp_lock; + struct arp_entry *arp4; + struct rte_hash *arp_hash; + }; + + struct { + struct in6_addr ipv6; + struct in6_addr ipv6_gw; + + uint32_t arp6_num; + rte_spinlock_t arp6_lock; + struct arp_entry *arp6; + struct rte_hash *arp6_hash; + }; + + struct { + rte_spinlock_t frag_lock; + struct rte_ip_frag_tbl *frag_tbl; + struct rte_ip_frag_death_row frag_dr; + }; + + struct tle_dest lb_dst; + struct tle_dest lb_dst_v6; + + struct tle_mib mib; +} __rte_cache_aligned; + +extern int nb_ctx; +extern struct glue_ctx *default_ctx; +extern struct glue_ctx ctx_array[MAX_NB_CTX]; + +RTE_DECLARE_PER_LCORE(struct glue_ctx *, glue_ctx); + +static inline struct glue_ctx * +get_ctx(void) +{ + if (RTE_PER_LCORE(glue_ctx)) + return RTE_PER_LCORE(glue_ctx); + return default_ctx; +} + +static inline uint8_t +get_cid(void) +{ + return get_ctx() - ctx_array; +} + +uint8_t glue_ctx_alloc(void); + +struct glue_ctx * glue_ctx_lookup(uint16_t port_id, uint16_t queue_id); + +void glue_ctx_free(struct glue_ctx *ctx); + +#define DPDK_IP "DPDK_IP" +#define DPDK_IP_DEF "0.0.0.0" +#define DPDK_IP_MASK "DPDK_IP_MASK" +#define DPDK_IP_MASK_DEF "16" +#define DPDK_IP_GATEWAY "DPDK_IP_GATEWAY" +#define DPDK_IP_GATEWAY_DEF "0.0.0.0" +#define DPDK_IPV6 "DPDK_IPV6" +#define DPDK_IPV6_DEF "::" +#define DPDK_IPV6_MASK "DPDK_IPV6_MASK" +#define DPDK_IPV6_MASK_DEF "64" +#define DPDK_IPV6_GATEWAY "DPDK_IPV6_GATEWAY" +#define DPDK_IPV6_GATEWAY_DEF "::" + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GLUE_SOCK_H_ */ diff --git a/lib/libtle_glue/epoll.c b/lib/libtle_glue/epoll.c new file mode 100644 index 0000000..2ac6923 --- /dev/null +++ b/lib/libtle_glue/epoll.c @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include "fd.h" +#include "ctx.h" +#include "sym.h" +#include "log.h" +#include "util.h" +#include "sock.h" +#include "internal.h" +#include "tle_glue.h" +#include "../libtle_l4p/udp_stream.h" + +#define EPOLL_DATA_SPECIAL 0xFFFFFFFFFFFFFF01 + +/* We don't use rte_eth_dev_rx_intr_ctl_q as it has its + * own way to specify event.data + */ +static int +dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, int efd, int op, int rx) +{ + int fd, ret; + uint32_t vec, efd_idx; + struct rte_eth_dev *dev; + struct rte_intr_handle *intr_handle; + static struct epoll_event ev = { + .events = EPOLLIN | EPOLLPRI | EPOLLET, + .data = { + .u64 = EPOLL_DATA_SPECIAL, + }, + }; + char buf[32]; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + if (queue_id >= dev->data->nb_rx_queues) + return -EINVAL; + + if (!dev->intr_handle) + return -ENOTSUP; + + intr_handle = dev->intr_handle; + if (!intr_handle->intr_vec) + return -EPERM; + + vec = intr_handle->intr_vec[queue_id]; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? + (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + fd = intr_handle->efds[efd_idx]; + + if (rx) { + /* almost all devices use eventfd, we shall read out */ + ret = read(fd, buf, sizeof(uint64_t)); + RTE_SET_USED(ret); + } + + return k_epoll_ctl(efd, op, fd, &ev); +} + +int +PRE(epoll_create)(int size) +{ + int epfd; + struct sock *so; + + if (!fd_table_initialized) + return k_epoll_create(size); + + epfd = get_unused_fd(); + if (epfd == -1) { + errno = EMFILE; + return -1; + } + + + so = fd2sock(epfd); + so->cid = glue_ctx_alloc(); + + so->shadow_efd = k_epoll_create(1); + if (so->shadow_efd < 0) + rte_panic("Failed to create shadow efd"); + + if (dev_rx_intr_ctl_q(CTX(so)->port_id, CTX(so)->queue_id, + so->shadow_efd, RTE_INTR_EVENT_ADD, 0) < 0) + rte_panic("Failed to epoll_ctl rxq interrupt fd"); + + so->epoll = 1; + + return epfd; +} + +int +PRE(epoll_create1)(int flags __rte_unused) +{ + return PRE(epoll_create)(1); +} + +int +PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event) +{ + struct sock *so_ep; + struct sock *so; + + if (is_kernel_fd(epfd)) { + if (!is_kernel_fd(fd)) + rte_panic("kernel epoll (%d) on an userspace fd: %d", + epfd, fd); + + return k_epoll_ctl(epfd, op, fd, event); + } + + so_ep = fd2sock(epfd); + + if (is_kernel_fd(fd)) { + /* Use a shadow epoll fd for possible kernel I/O events. 
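+		 * Kernel fds attached to a userspace epfd are collected on
+		 * so_ep->shadow_efd, which poll_common() hands to
+		 * epoll_kernel_wait() so both kinds of events get reported.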
*/ + return k_epoll_ctl(so_ep->shadow_efd, op, fd, event); + } + + so = fd2sock(fd); + + if (unlikely(so->cid != so_ep->cid)) + rte_panic("Different ctx %d and %d for epoll fd and socket fd", + so_ep->cid, so->cid); + + GLUE_DEBUG("epoll_ctl: op = %x, fd = %d, event = %x", + op, fd, event->events); + switch (op) { + case EPOLL_CTL_ADD: + if (so->event.events) { + errno = EEXIST; + return -1; + } + + so->event = *event; + + break; + case EPOLL_CTL_MOD: + if (so->event.events == 0) { + errno = ENOENT; + return -1; + } + + so->event = *event; + break; + case EPOLL_CTL_DEL: + if (so->event.events == 0) { + errno = ENOENT; + return -1; + } + + so->event.events = 0; + break; + default: + errno = EINVAL; + return -1; + } + + return 0; +} + +static inline int32_t +tle_evq_fetch(struct tle_evq *evq, const void *evd[], uint32_t num, uint32_t event) +{ + uint32_t polled; + uint32_t i, k; + struct tle_event *ev; + struct tle_event *next; + + k = 0; + + if (evq->nb_armed == 0) + return 0; + + rte_compiler_barrier(); + + rte_spinlock_lock(&evq->lock); + ev = TAILQ_FIRST(&evq->armed); + for (i = 0; i != evq->nb_armed; i++) { + next = TAILQ_NEXT(ev, ql); + polled = ((const struct sock *)ev->data)->event.events; + /* Always report EPOLLHUP, see man epoll_ctl(2) */ + if (polled && ((polled | EPOLLHUP) & event)) { + evd[k++] = ev->data; + TAILQ_REMOVE(&evq->armed, ev, ql); + /* don't down erev; and assign NULL to data means this + * ev is already removed from the queue, refer to + * tle_event_idle_err(). + */ + if (event != EPOLLHUP) + ev->state = TLE_SEV_DOWN; + else + ev->data = NULL; + } + if (k == num) + break; + ev = next; + } + evq->nb_armed -= k; + rte_spinlock_unlock(&evq->lock); + return k; +} + +static int +evq_drain(struct tle_evq *q, uint32_t event, + struct epoll_event *events, int maxevents) +{ + uint32_t i, n; + struct sock *socks[maxevents]; + + n = tle_evq_fetch(q, (const void **)(uintptr_t)socks, maxevents, event); + for (i = 0; i < n; ++i) { + events[i].events = event; + events[i].data = socks[i]->event.data; + + GLUE_DEBUG("event for fd = %d, event = %x", + socks[i]->event.data.fd, event); + } + return n; +} + +#ifdef LOOK_ASIDE_BACKEND +rte_atomic32_t flag_sleep; + +int +epoll_kernel_wait(struct glue_ctx *ctx, int efd, + struct epoll_event *events, + int maxevents, int timeout, int *rx) +{ + int rc; + struct epoll_event event; + uint16_t port_id = ctx->port_id; + uint16_t queue_id = ctx->queue_id; + + RTE_SET_USED(events); + RTE_SET_USED(maxevents); + RTE_SET_USED(rx); + + rte_eth_dev_rx_intr_enable(port_id, queue_id); + + /* TODO: timeout shall be limited by the latest tcp timer */ + + if (be_process(ctx) > 0) /* use this way to avoid concurrency */ + rc = 0; + else + rc = sleep_with_lock(efd, &event, 1, timeout); + + rte_eth_dev_rx_intr_disable(port_id, queue_id); + return rc; +} +#else +int +epoll_kernel_wait(struct glue_ctx *ctx, int efd, + struct epoll_event *events, + int maxevents, int timeout, int *rx) +{ + int i, j, rc; + int flag_tmp = 0; + uint16_t port_id = ctx->port_id; + uint16_t queue_id = ctx->queue_id; +#define LEAST_EVENTS 8 + struct epoll_event s_events[LEAST_EVENTS]; + struct epoll_event *r_events; + int r_maxevents; + int fastpath = 0; + + *rx = 0; + + if (efd == -1) { + flag_tmp = 1; + efd = k_epoll_create(1); + if (efd < 0) + rte_panic("Failed to create tmp efd"); + } + + if (stopped) { + rc = k_epoll_pwait(efd, events, maxevents, timeout, NULL); + goto check; + } + + if (maxevents < LEAST_EVENTS) { + r_events = s_events; + r_maxevents = maxevents + 1; + } else { + 
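+		/* The caller's buffer is large enough; use it in place and
+		 * let the special rxq event occupy one of its slots.
+		 */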
		r_events = events;
+		r_maxevents = maxevents;
+	}
+
+	if (flag_tmp && dev_rx_intr_ctl_q(port_id, queue_id,
+			efd, RTE_INTR_EVENT_ADD, 0) < 0)
+		/* TODO: fall back to busy polling */
+		rte_panic("Failed to enable rxq interrupt");
+
+	rte_eth_dev_rx_intr_enable(port_id, queue_id);
+
+	/* TODO: timeout shall be limited by the latest tcp timer */
+
+	if (timeout != 0 && be_process(ctx) > 0) {
+		/* process in place to avoid concurrency */
+		rc = 0;
+		fastpath = 1;
+	} else
+		rc = sleep_with_lock(efd, r_events, r_maxevents, timeout);
+
+	rte_eth_dev_rx_intr_disable(port_id, queue_id);
+
+	/* filter out the special rxq event */
+	for (i = 0, j = 0; i < rc; ++i) {
+		if (r_events[i].data.u64 == EPOLL_DATA_SPECIAL) {
+			*rx = true;
+			if (i + 1 < rc) {
+				memcpy(&r_events[j], &r_events[i+1],
+				       (rc-i-1) * sizeof(*events));
+			}
+			rc -= 1;
+			break;
+		} else {
+			if (i != j)
+				r_events[j] = r_events[i];
+			j++;
+		}
+	}
+
+	if (rc > 0 && maxevents < LEAST_EVENTS)
+		memcpy(events, r_events, rc * sizeof(*events));
+
+	if (flag_tmp)
+		dev_rx_intr_ctl_q(port_id, queue_id, efd, RTE_INTR_EVENT_DEL, *rx);
+
+	if (fastpath)
+		*rx = true;
+check:
+	if (flag_tmp)
+		close(efd);
+
+	return rc;
+}
+#endif
+
+/* As long as there are some packets to process, we don't sleep; instead,
+ * we poll for a number of iterations to check for packets.
+ *
+ * TODO: change to wait for a period of time?
+ */
+#define IDLE_ITERATIONS 5
+
+int
+poll_common(struct glue_ctx *ctx, struct epoll_event *events,
+	    int maxevents, int timeout, int shadow_efd)
+{
+	int rx;
+	int total = 0;
+	int idle = IDLE_ITERATIONS;
+	/* We start with send, then recv, and then accept, as we want to
+	 * serve existing connections first, then new connections, and
+	 * lastly, connections in error.
+	 */
+again:
+	/* 0. send evq */
+	total += evq_drain(ctx->txeq, EPOLLOUT,
+			   events + total, maxevents-total);
+	if (total == maxevents)
+		return total;
+
+	/* 1. recv evq */
+	total += evq_drain(ctx->rxeq, EPOLLIN,
+			   events + total, maxevents-total);
+	if (total == maxevents)
+		return total;
+
+	/* 2. accept evq */
+	total += evq_drain(ctx->syneq, EPOLLIN,
+			   events + total, maxevents-total);
+	if (total == maxevents)
+		return total;
+
+	/* 3. err evq */
+	total += evq_drain(ctx->ereq, EPOLLHUP,
+			   events + total, maxevents-total);
+
+	if (total > 0)
+		return total;
+
+	while ((idle--) > 0) {
+		be_process(ctx);
+		goto again;
+	}
+
+	if (timeout == 0)
+		return 0;
+
+	/* Set up rxq interrupt mode, and check kernel I/O events */
+	total = epoll_kernel_wait(ctx, shadow_efd, events,
+				  maxevents, timeout, &rx);
+
+	/* Kernel I/O events are available (total > 0), or something
+	 * bad happened (total < 0).
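+	 * A timeout yields 0; in that case we fall through and poll the
+	 * userspace stack again before sleeping anew.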
+ */ + if (total != 0) + return total; + + /* Check userspace I/O events */ + idle = IDLE_ITERATIONS; + be_process(ctx); + goto again; +} + +int +PRE(epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout) +{ + struct sock *so; + + if (is_kernel_fd(epfd)) + return k_epoll_pwait(epfd, events, maxevents, timeout, NULL); + + so = fd2sock(epfd); + + /* thread <> context binding happens here */ + if (RTE_PER_LCORE(glue_ctx) == NULL) + RTE_PER_LCORE(glue_ctx) = CTX(so); + + return poll_common(CTX(so), events, maxevents, timeout, so->shadow_efd); +} + +int +PRE(epoll_pwait)(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask) +{ + if (sigmask != NULL) { + rte_panic("epoll_pwait with signal is not supported"); + } + + return epoll_wait(epfd, events, maxevents, timeout); +} + +int +fd_ready(int fd, int events) +{ + int ret = 0; + struct sock *so = fd2sock(fd); + + if (unlikely(!so->s)) { + if (so->erev) /* socket has been shutdown */ + return events | EPOLLHUP; + else /* socket is not set up yet */ + return 0; + } + + if (unlikely(IS_TCP(so) && + TCP_STREAM(so->s)->tcb.state == TCP_ST_CLOSED)) { + return events | EPOLLHUP | EPOLLERR; + } + + if (so->erev && tle_event_state(so->erev) == TLE_SEV_UP) + ret |= EPOLLHUP; + + if (events & EPOLLIN) { + if (so->rx_left || + (IS_TCP(so) && rte_ring_count(TCP_STREAM(so->s)->rx.q) > 0) || + (IS_UDP(so) && rte_ring_count(UDP_STREAM(so->s)->rx.q) > 0)) + ret |= EPOLLIN; + } + + if (events & EPOLLOUT) { + if ((IS_TCP(so) && + TCP_STREAM(so->s)->tcb.state >= TCP_ST_ESTABLISHED && + rte_ring_free_count(TCP_STREAM(so->s)->tx.q) > 0) || + (IS_UDP(so) && + rte_ring_count(UDP_STREAM(so->s)->tx.drb.r) > 0)) + ret |= EPOLLOUT; + } + + return ret; +} + +void +v_get_stats_snmp(unsigned long mibs[]) +{ + int i, j, k; + + memcpy(mibs, &default_mib, sizeof(default_mib)); + + for (i = 0; i < nb_ctx; ++i) { + for (j = 0; j < TCP_MIB_MAX; ++j) + mibs[j] += ctx_array[i].mib.tcp.mibs[j]; + + for (k = 0; k < UDP_MIB_MAX; ++k) + mibs[j+k] += ctx_array[i].mib.udp.mibs[k]; + } +} diff --git a/lib/libtle_glue/fd.c b/lib/libtle_glue/fd.c new file mode 100644 index 0000000..b01ac87 --- /dev/null +++ b/lib/libtle_glue/fd.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "fd.h" +#include "log.h" +#include "util.h" +#include "config.h" + +bool fd_table_initialized; + +struct fd_table fd_table = { .fd_base = INT_MAX, }; + +static int +get_ulimit_nofile(void) +{ + struct rlimit rlim; + +#define GLUE_BASE_FD 1024 + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) + return GLUE_BASE_FD; + + return rlim.rlim_cur; /* soft limit, rlim_max is the hard limit */ +} + +static void +fd_num_set(int *fd_base, int *fd_num) +{ + int limit = get_ulimit_nofile(); + + /* fix me: alignment of power of two */ + /* fix me: use dup2 to occupy these fds */ + *fd_num = limit / 2; + *fd_num = RTE_MIN(MAX_STREAMS_PER_CORE * 2 * MAX_NB_CTX, *fd_num); + + *fd_base = limit - *fd_num; + GLUE_LOG(INFO, "fd_base = %d, fd_num = %d", *fd_base, *fd_num); +} + +static void +add_fd(struct rte_mempool *mp __rte_unused, void *opaque __rte_unused, + void *obj, unsigned obj_idx) +{ + ((struct sock *)obj)->fd = obj_idx + fd_table.fd_base; + fd_table.socks[obj_idx] = obj; +} + +void +fd_init(void) +{ + int ret; + size_t sz; + uint32_t socket_id; + int fd_base, fd_num; + struct rte_mempool *mp = NULL; + char name[RTE_MEMPOOL_NAMESIZE]; + + socket_id = get_socket_id(); + + fd_num_set(&fd_base, &fd_num); + + sz = sizeof(fd_table.socks[0]) * fd_num; + fd_table.socks = rte_zmalloc_socket("fdtable", sz, + RTE_CACHE_LINE_SIZE, socket_id); + if (fd_table.socks == NULL) { + GLUE_LOG(ERR, "Failed to malloc fd table"); + goto err; + } + + snprintf(name, RTE_MEMPOOL_NAMESIZE, "mp_fd_%d_%d", fd_base, fd_num); + mp = rte_mempool_create_empty(name, fd_num - 1, sizeof(struct sock), + 32, 0, socket_id, MEMPOOL_F_DYNAMIC); + if (mp == NULL) { + GLUE_LOG(ERR, "Failed to create mp for fd table"); + goto err; + } + + ret = rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL); + if (ret != 0) { + GLUE_LOG(ERR, "Failed to set mp ops: %d", ret); + goto err; + } + + rte_mempool_set_dynamic_size(mp, 1024); + rte_mempool_set_dynamic_cb(mp, add_fd); + + fd_table.mp = mp; + fd_table.fd_base = fd_base; + fd_table.fd_num = fd_num; + + /* should populate after fd_table is set */ + ret = rte_mempool_populate_default(mp); + if (ret < 0) { + GLUE_LOG(ERR, "Failed to populate mp: %d", ret); + goto err; + } + + fd_table_initialized = true; + + return; +err: + rte_mempool_free(mp); + rte_panic("Failed to init fd_table"); +} diff --git a/lib/libtle_glue/fd.h b/lib/libtle_glue/fd.h new file mode 100644 index 0000000..abffac2 --- /dev/null +++ b/lib/libtle_glue/fd.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _TLE_GLUE_FD_H_ +#define _TLE_GLUE_FD_H_ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "log.h" +#include "sock.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct fd_table { + int fd_base; /* The mininum fd, 64 aligned */ + int fd_num; /* The number of fds, 64 aligned */ + struct rte_mempool *mp; /* O(1) get and put */ + struct sock **socks; +}; + +extern bool fd_table_initialized; +extern struct fd_table fd_table; + +static inline struct sock *fd2sock(int fd) +{ + return fd_table.socks[fd - fd_table.fd_base]; +} + +static inline int sock2fd(struct sock *so) +{ + return so->fd; +} + +static inline int +get_unused_fd(void) +{ + struct sock *so; + + if (unlikely(rte_mempool_get(fd_table.mp, (void **)&so) < 0)) { + GLUE_LOG(ERR, "FDs have been exhausted"); + errno = ENFILE; + return -1; + } + + so->valid = 1; + return sock2fd(so); +} + +static inline void +tle_event_idle_err(struct tle_event *ev) +{ + struct tle_evq *q; + + if (ev->state == TLE_SEV_IDLE) + return; + + q = ev->head; + rte_compiler_barrier(); + + rte_spinlock_lock(&q->lock); + if (ev->state == TLE_SEV_UP && ev->data) { + TAILQ_REMOVE(&q->armed, ev, ql); + q->nb_armed--; + } + ev->state = TLE_SEV_IDLE; + rte_spinlock_unlock(&q->lock); +} + +static inline void +free_fd_event(struct sock *so) +{ + if (so->erev) { + tle_event_idle_err(so->erev); + tle_event_free(so->erev); + } + if (so->rxev) { + tle_event_idle(so->rxev); + tle_event_free(so->rxev); + } + if (so->txev) { + tle_event_idle(so->txev); + tle_event_free(so->txev); + } +} + +static inline void +put_free_fd(int fd) +{ + struct sock *so = fd2sock(fd); + + rte_mempool_put(fd_table.mp, so); +} + +static inline bool +is_kernel_fd(int fd) +{ + return fd < fd_table.fd_base; +} + +void fd_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GLUE_FD_H_ */ diff --git a/lib/libtle_glue/gateway.h b/lib/libtle_glue/gateway.h new file mode 100644 index 0000000..4b5f12c --- /dev/null +++ b/lib/libtle_glue/gateway.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2019 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _TLE_GATEWAY_H_ +#define _TLE_GATEWAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static inline int +is_ipv4_loopback_addr(in_addr_t addr, struct glue_ctx *ctx) +{ + if (addr == ctx->ipv4 || addr == htonl(INADDR_LOOPBACK)) + return 1; + else + return 0; +} + +static inline int +is_ipv6_loopback_addr(const struct in6_addr *addr, struct glue_ctx *ctx) +{ + if (memcmp(addr, &ctx->ipv6, sizeof(struct in6_addr)) == 0 || + IN6_IS_ADDR_LOOPBACK(addr) || + (IN6_IS_ADDR_V4COMPAT(addr) && + addr->__in6_u.__u6_addr32[3] == htonl(INADDR_LOOPBACK)) || + (IN6_IS_ADDR_V4MAPPED(addr) && + addr->__in6_u.__u6_addr32[3] == htonl(INADDR_LOOPBACK))) + return 1; + else + return 0; +} + +static inline const struct in_addr* ipv4_gateway_lookup(void *data, + const struct in_addr *addr, struct in_addr *gate) +{ + struct glue_ctx *ctx = data; + + if (is_ipv4_loopback_addr(addr->s_addr, ctx)) + return addr; + + uint8_t ls = 32 - ctx->ipv4_ml; + if ((addr->s_addr << ls) == (ctx->ipv4 << ls)) { + return addr; + } + else { + if (ctx->ipv4_gw != 0) { + gate->s_addr = ctx->ipv4_gw; + return gate; + } else { + return addr; + } + } +} + +static inline const struct in6_addr* ipv6_gateway_lookup(void *data, + const struct in6_addr *addr, struct in6_addr *gate) +{ + struct glue_ctx *ctx = data; + uint8_t ls; + + if (is_ipv6_loopback_addr(addr, ctx)) + return addr; + + if (ctx->ipv6_ml <= 64) { + ls = 64 - ctx->ipv6_ml; + if ((*(const uint64_t*)addr << ls) + == (*(const uint64_t*)&ctx->ipv6 << ls)) { + return addr; + } + } else if (*(const uint64_t*)addr == *(const uint64_t*)&ctx->ipv6) { + ls = 128 - ctx->ipv6_ml; + if ((*((const uint64_t*)addr + 1) << ls) + == (*((const uint64_t*)&ctx->ipv6 + 1) << ls)) { + return addr; + } + } + + if (!IN6_IS_ADDR_UNSPECIFIED(&ctx->ipv6_gw)) { + rte_memcpy(gate, &ctx->ipv6_gw, sizeof(struct in6_addr)); + return gate; + } else { + return addr; + } +} + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GATEWAY_H_ */ diff --git a/lib/libtle_glue/icmp.c b/lib/libtle_glue/icmp.c new file mode 100644 index 0000000..c0a622a --- /dev/null +++ b/lib/libtle_glue/icmp.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "log.h" +#include "ctx.h" +#include "internal.h" + +#define ICMP_ECHOREPLY 0 /* Echo Reply */ +#define ICMP_ECHO 8 /* Echo Request */ +#define ICMP_TIMESTAMP 13 /* Timestamp Request */ +#define ICMP_TIMESTAMPREPLY 14 /* Timestamp Reply */ + +/* Codes for TIME_EXCEEDED. 
 */
+#define ICMP_EXC_TTL		0	/* TTL count exceeded */
+#define ICMP_EXC_FRAGTIME	1	/* Fragment Reass time exceeded */
+
+/* Parameters used to convert the timespec values */
+#define SECONDS_PER_DAY		86400L
+#define MSEC_PER_SEC		1000L
+#define USEC_PER_MSEC		1000L
+#define NSEC_PER_USEC		1000L
+#define NSEC_PER_MSEC		(NSEC_PER_USEC * USEC_PER_MSEC)
+
+#define IS_IPV4_BCAST(x)	((x) == (uint32_t)0xFFFFFFFF)
+
+struct icmp_pkt {
+	struct icmp_hdr icmp_h;
+	uint32_t times[3];
+};
+
+/* Return remainder for ``dividend / divisor`` */
+static inline uint32_t
+div_uint64_rem(uint64_t dividend, uint32_t divisor)
+{
+	return dividend % divisor;
+}
+
+/* Return milliseconds since midnight (UTC) in network byte order. */
+static uint32_t
+current_timestamp(void)
+{
+	struct timespec ts;
+	uint32_t msecs;
+	uint32_t secs;
+
+	(void)clock_gettime(CLOCK_REALTIME, &ts);
+
+	/* Get secs since midnight. */
+	secs = div_uint64_rem(ts.tv_sec, SECONDS_PER_DAY);
+	/* Convert to msecs. */
+	msecs = secs * MSEC_PER_SEC;
+	/* Convert nsec to msec. */
+	msecs += (uint32_t)ts.tv_nsec / NSEC_PER_MSEC;
+
+	/* Convert to network byte order. */
+	return rte_cpu_to_be_32(msecs);
+}
+
+/*
+ * Compute the checksum of an ICMP packet. The checksum field must be set
+ * to 0 by the caller.
+ */
+static uint16_t
+icmp_cksum(const struct icmp_hdr *icmp, uint32_t data_len)
+{
+	uint16_t cksum;
+
+	cksum = rte_raw_cksum(icmp, sizeof(struct icmp_hdr) + data_len);
+	return (cksum == 0xffff) ? cksum : ~cksum;
+}
+
+/**
+ * Receive and handle an ICMP packet.
+ *
+ * @param ctx
+ *   The pointer to the glue context.
+ * @param pkt
+ *   The pointer to the raw packet data.
+ * @param l2_len
+ *   The size of the l2 header.
+ * @param l3_len
+ *   The size of the l3 header.
+ * @return
+ *   MUST return NULL now. :-)
+ */
+struct rte_mbuf *
+icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt, uint32_t l2_len, uint32_t l3_len)
+{
+	struct ether_addr eth_addr;
+	struct icmp_pkt *icmp_pkt;
+	struct ether_hdr *eth_h;
+	struct icmp_hdr *icmp_h;
+	struct ipv4_hdr *ip_h;
+	uint32_t ip_addr;
+	uint32_t cksum;
+
+	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ip_h = (struct ipv4_hdr *) ((char *)eth_h + l2_len);
+
+	icmp_h = (struct icmp_hdr *)((char *)ip_h + l3_len);
+	if (icmp_h->icmp_type != IP_ICMP_ECHO_REQUEST &&
+	    icmp_h->icmp_type != ICMP_TIMESTAMP)
+		goto drop_pkt;
+
+	icmp_pkt = (struct icmp_pkt *)icmp_h;
+
+	ether_addr_copy(&eth_h->s_addr, &eth_addr);
+	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
+	ether_addr_copy(&eth_addr, &eth_h->d_addr);
+
+	/*
+	 * Similar to the Linux implementation, we silently drop broadcast
+	 * and multicast ICMP packets.
+	 *
+	 * RFC 1122: 3.2.2.6  An ICMP_ECHO to broadcast MAY be
+	 *   silently ignored.
+	 * RFC 1122: 3.2.2.8  An ICMP_TIMESTAMP MAY be silently
+	 *   discarded if to broadcast/multicast.
+	 */
+	ip_addr = rte_be_to_cpu_32(ip_h->dst_addr);
+	if (IS_IPV4_MCAST(ip_addr) || IS_IPV4_BCAST(ip_addr))
+		goto drop_pkt;
+
+	ip_addr = ip_h->src_addr;
+	ip_h->src_addr = ip_h->dst_addr;
+	ip_h->dst_addr = ip_addr;
+
+	if (icmp_h->icmp_type == IP_ICMP_ECHO_REQUEST &&
+	    icmp_h->icmp_code == 0) {
+
+		/* Must clear checksum field before calling the helper. */
+		ip_h->hdr_checksum = 0;
+		ip_h->hdr_checksum = rte_ipv4_cksum(ip_h);
+
+		icmp_h->icmp_type = IP_ICMP_ECHO_REPLY;
+		icmp_h->icmp_code = 0;
+
+		/*
+		 * Fix me: the data part of an ICMP echo request/reply
+		 * message is implementation specific; we don't know
+		 * how to verify or calculate a checksum over it.
+		 *
+		 * Need to look at the BSD or Linux implementation.
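+		 * Meanwhile, the reply checksum can be derived from the
+		 * request without touching the payload: only the type byte
+		 * changes (ECHO_REQUEST 8 -> ECHO_REPLY 0), so the
+		 * incremental update of RFC 1624, HC' = ~(~HC + ~m + m'),
+		 * is applied below.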
+		 */
+		cksum = ~icmp_h->icmp_cksum & 0xffff;
+		cksum += ~rte_cpu_to_be_16(IP_ICMP_ECHO_REQUEST << 8) & 0xffff;
+		cksum += rte_cpu_to_be_16(IP_ICMP_ECHO_REPLY << 8);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		cksum = (cksum & 0xffff) + (cksum >> 16);
+		icmp_h->icmp_cksum = ~cksum;
+
+	} else if (icmp_h->icmp_type == ICMP_TIMESTAMP &&
+		   icmp_h->icmp_code == 0) {
+
+		/*
+		 * RFC 1122: 3.2.2.8  MAY implement ICMP timestamp requests.
+		 *   SHOULD be in the kernel for minimum random latency.
+		 *   MUST be accurate to a few minutes.
+		 *   MUST be updated at least at 15Hz.
+		 */
+		icmp_h->icmp_type = ICMP_TIMESTAMPREPLY;
+		icmp_h->icmp_code = 0;
+		icmp_pkt->times[1] = current_timestamp();
+		icmp_pkt->times[2] = icmp_pkt->times[1];
+
+		icmp_h->icmp_cksum = 0;
+		/* the data part of an ICMP timestamp reply is 12 bytes. */
+		icmp_h->icmp_cksum = icmp_cksum(icmp_h, 12);
+	} else {
+		goto drop_pkt;
+	}
+
+	if (pkt->pkt_len < ETHER_MIN_LEN)
+		rte_pktmbuf_append(pkt, ETHER_MIN_LEN - pkt->pkt_len);
+
+	if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1))
+		GLUE_LOG(DEBUG, "Send ICMP echo reply OK");
+
+	return NULL;
+
+drop_pkt:
+	rte_pktmbuf_free(pkt);
+	return NULL;
+}
+
+/**
+ * Receive and handle an ICMPv6 packet.
+ *
+ * @param ctx
+ *   The pointer to the glue context.
+ * @param pkt
+ *   The pointer to the raw packet data.
+ * @param l2_len
+ *   The size of the l2 header.
+ * @param l3_len
+ *   The size of the l3 header.
+ * @return
+ *   MUST return NULL now. :-)
+ */
+struct rte_mbuf *
+icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt, uint32_t l2_len, uint32_t l3_len)
+{
+	struct ether_addr eth_addr;
+	struct ether_hdr *eth_h;
+	struct icmp6_hdr *icmp6_h;
+	struct ipv6_hdr *ipv6_h;
+	struct in6_addr ipv6_addr;
+	uint32_t cksum;
+
+	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ipv6_h = (struct ipv6_hdr *) ((char *)eth_h + l2_len);
+
+	icmp6_h = (struct icmp6_hdr *)((char *)ipv6_h + l3_len);
+
+	/* NDP pkt */
+	if ((icmp6_h->icmp6_type == ND_NEIGHBOR_SOLICIT ||
+	     icmp6_h->icmp6_type == ND_NEIGHBOR_ADVERT) &&
+	    icmp6_h->icmp6_code == 0) {
+		return ndp_recv(ctx, pkt, l2_len, l3_len);
+	}
+
+	/* only ECHO is supported now; other types of pkts are dropped */
+	if ((icmp6_h->icmp6_type != ICMP6_ECHO_REQUEST &&
+	     icmp6_h->icmp6_type != ICMP6_ECHO_REPLY) ||
+	    icmp6_h->icmp6_code != 0)
+		goto drop_pkt;
+
+	ether_addr_copy(&eth_h->s_addr, &eth_addr);
+	ether_addr_copy(&eth_h->d_addr, &eth_h->s_addr);
+	ether_addr_copy(&eth_addr, &eth_h->d_addr);
+
+	/*
+	 * For now, we silently drop anycast and multicast ICMP packets,
+	 * which does not conform to RFC 4443. Maybe fix it later.
+	 *
+	 * RFC 4443: 4.2  An Echo Reply SHOULD be sent in response to an
+	 * Echo Request message sent to an IPv6 multicast or anycast address.
+	 * In this case, the source address of the reply MUST be a unicast
+	 * address belonging to the interface on which the Echo Request
+	 * message was received.
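+	 * Until that is implemented, requests are answered only when they
+	 * are addressed to our own unicast address.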
+ */ + switch (icmp6_h->icmp6_type) { + case ICMP6_ECHO_REQUEST: + if (memcmp(ipv6_h->dst_addr, &ctx->ipv6, sizeof(struct in6_addr)) != 0) + goto drop_pkt; + + rte_memcpy(&ipv6_addr, ipv6_h->src_addr, sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr, sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->dst_addr, &ipv6_addr, sizeof(struct in6_addr)); + + icmp6_h->icmp6_type = ICMP6_ECHO_REPLY; + + cksum = ~icmp6_h->icmp6_cksum & 0xffff; + cksum += ~rte_cpu_to_be_16(ICMP6_ECHO_REQUEST << 8) & 0xffff; + cksum += rte_cpu_to_be_16(ICMP6_ECHO_REPLY << 8); + cksum = (cksum & 0xffff) + (cksum >> 16); + cksum = (cksum & 0xffff) + (cksum >> 16); + icmp6_h->icmp6_cksum = ~cksum; + + break; + default: + goto drop_pkt; + } + + if (pkt->pkt_len < ETHER_MIN_LEN) + rte_pktmbuf_append(pkt, ETHER_MIN_LEN - pkt->pkt_len); + + if (rte_eth_tx_burst(ctx->port_id, ctx->queue_id, &pkt, 1)) + GLUE_LOG(DEBUG, "Send ICMP echo reply OK"); + + return NULL; + +drop_pkt: + rte_pktmbuf_free(pkt); + return NULL; +} diff --git a/lib/libtle_glue/init.c b/lib/libtle_glue/init.c new file mode 100644 index 0000000..298db4a --- /dev/null +++ b/lib/libtle_glue/init.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include + +#include +#include +#include +#include + +#include "util.h" +#include "fd.h" +#include "ctx.h" +#include "sym.h" +#include "log.h" +#include "internal.h" +#include "tle_glue.h" + +void +glue_init1(int argc, char **argv) +{ + GLUE_LOG(INFO, "init: DPDK and fd table..."); + + if (rte_eal_init(argc, argv) < 0) + rte_panic("Failed to init DPDK"); + + fd_init(); +} + +static void __attribute__((constructor)) +glue_init(void) +{ + char *p; + int i, err, argc = 0; + char **argv = NULL, **argv_to_release = NULL; + char *vnic, *params, *no_huge; + cpu_set_t cpuset; + pthread_t tid = pthread_self(); + + symbol_init(); + +#define DPDK_PARAMS "DPDK_PARAMS" + params = getenv(DPDK_PARAMS); +#define DPDK_NO_HUGE "DPDK_NO_HUGE" + no_huge = getenv(DPDK_NO_HUGE); +#define DPDK_VNIC "DPDK_VNIC" + vnic = getenv(DPDK_VNIC); + + if (params == NULL && no_huge == NULL && vnic == NULL) + return; + + argv = grow_argv(argv, argc, 1); + argv[argc++] = xstrdup("userspace-stack"); + + /* Get the main thread affinity */ + CPU_ZERO(&cpuset); + err = pthread_getaffinity_np(tid, sizeof(cpu_set_t), &cpuset); + if (!err) { + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, &cpuset)) { + argv = grow_argv(argv, argc, 2); + argv[argc++] = xstrdup("-l"); + argv[argc++] = xasprintf("%d", i); + i = CPU_SETSIZE; + } + } + } else { + argv = grow_argv(argv, argc, 2); + argv[argc++] = xstrdup("-l"); + argv[argc++] = xasprintf("0"); + } + + if (params) + p = strtok(params, " "); + else + p = NULL; + while (p != NULL) { + argv = grow_argv(argv, argc, 1); + argv[argc++] = xstrdup(p); + p = strtok(NULL, " "); + } + + if (no_huge) { + argv = grow_argv(argv, argc, 3); + argv[argc++] = xstrdup("-m"); + argv[argc++] = xstrdup("2048"); + argv[argc++] = xstrdup("--no-huge"); + } + + if (vnic) { + argv = grow_argv(argv, argc, 2); + argv[argc++] = xstrdup(vnic); + argv[argc++] = xstrdup("--no-pci"); + } + + argv = grow_argv(argv, argc, 1); + argv[argc++] = xstrdup("--"); + + argv_to_release = grow_argv(argv_to_release, 0, argc); + for (i = 0; i < argc; ++i) + argv_to_release[i] = argv[i]; + + glue_init1(argc, argv); + + /* Alloc and setup this default ctx for any sockets operations before + * thread/ctx binding which happens when epoll_wait. + */ + glue_ctx_alloc(); + + release_argv(argc, argv_to_release, argv); + + /* Set back the affinity */ + err = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpuset); + if (err) + GLUE_LOG(ERR, "Failed to set back affinity"); +} + +static void __attribute__((destructor)) +glue_uninit(void) +{ + struct sock *so; + struct glue_ctx *ctx; + int i, max = fd_table.fd_base + fd_table.fd_num; + + /* TODO: lets optimize it */ + for (i = fd_table.fd_base; i < max; i++) { + so = fd2sock(i); + if (!so || !so->valid) + continue; + if (IS_TCP(so)) + tle_tcp_stream_kill(so->s); + } + + for (i = 0; i < nb_ctx; ++i) { + ctx = glue_ctx_lookup(0, i); + while (be_process(ctx)) { /* empty */ }; + } +} diff --git a/lib/libtle_glue/internal.h b/lib/libtle_glue/internal.h new file mode 100644 index 0000000..f0da012 --- /dev/null +++ b/lib/libtle_glue/internal.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_INTERNAL_H_ +#define _TLE_GLUE_INTERNAL_H_ + +#include +#include + +#include + +#include +#include +#include +#include + +#include "ctx.h" +#include "sym.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int stopped; + +extern uint64_t rx_offload; +extern uint64_t tx_offload; + +extern uint32_t timestamp_needed; + +void port_reconfig(void); + +uint16_t create_loopback(uint32_t socket_id); + +struct rte_mempool * get_mempool_by_socket(int32_t socket_id); + +int be_process(struct glue_ctx *ctx); + +int be_tx(struct glue_ctx *ctx); + +struct rte_mbuf * arp_recv(struct glue_ctx *ctx, + struct rte_mbuf *m, uint32_t l2len); + +struct rte_mbuf * ndp_recv(struct glue_ctx *ctx, + struct rte_mbuf *m, uint32_t l2len, uint32_t l3len); + + +void mac_check(struct glue_ctx *ctx, const struct sockaddr* addr); + +int arp_ipv4_dst_lookup_tcp(void *data, const struct in_addr *addr, + struct tle_dest *res); + +int arp_ipv4_dst_lookup_udp(void *data, const struct in_addr *addr, + struct tle_dest *res); + +int arp_ipv6_dst_lookup_tcp(void *data, const struct in6_addr *addr, + struct tle_dest *res); + +int arp_ipv6_dst_lookup_udp(void *data, const struct in6_addr *addr, + struct tle_dest *res); + +int mac_fill(struct glue_ctx *ctx, struct rte_mbuf *m); + +void mac_timeout(struct glue_ctx *ctx); + +int setup_rx_cb(uint16_t port_id, uint16_t qid); + +int epoll_kernel_wait(struct glue_ctx *ctx, int efd, + struct epoll_event *events, + int maxevents, int timeout, int *rx); + +int poll_common(struct glue_ctx *ctx, struct epoll_event *events, + int maxevents, int timeout, int shadow_efd); + +int dev_rxq_wakeup(uint16_t port_id); + +struct rte_mbuf * icmp_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt, + uint32_t l2len, uint32_t l3len); + +struct rte_mbuf * icmp6_recv(struct glue_ctx *ctx, struct rte_mbuf *pkt, + uint32_t l2len, uint32_t l3len); + +uint16_t typen_rx_callback(uint16_t port, uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + uint16_t max_pkts, void *user_param); + +void ipv4_dst_add(struct glue_ctx *ctx, const struct in_addr *addr, + struct ether_addr *e_addr); + +void ipv6_dst_add(struct glue_ctx *ctx, const struct in6_addr *addr, + struct ether_addr *e_addr); + +#ifdef LOOK_ASIDE_BACKEND +extern rte_atomic32_t flag_sleep; + +enum { + IOTHREAD_BUSY = 0, /* io thread is busy */ + IOTHREAD_SLEEP, /* io thread is sleeping */ + IOTHREAD_PREEMPT, /* io thread is preempted by another worker thread */ +}; + +static inline int +sleep_with_lock(int efd, struct epoll_event *events, int max, int to) +{ + int rc; + + rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP); + rc = k_epoll_pwait(efd, events, max, to, NULL); + while (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep, + IOTHREAD_SLEEP, IOTHREAD_BUSY) == 0); + + return rc; +} + +static inline void +be_tx_with_lock(struct glue_ctx *ctx) +{ + if (rte_atomic32_cmpset((volatile uint32_t *)&flag_sleep, + IOTHREAD_SLEEP, IOTHREAD_PREEMPT)) { + while (be_tx(ctx) > 0) {}; + rte_atomic32_set(&flag_sleep, IOTHREAD_SLEEP); + } +} + +static inline void +wake_lookaside_backend(struct glue_ctx *ctx) +{ 
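+	/* If a worker thread preempted the sleeping io thread (see
+	 * be_tx_with_lock() above), kick the rxq eventfd so that the io
+	 * thread wakes up and resumes polling.
+	 */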
+ if (rte_atomic32_read(&flag_sleep) == IOTHREAD_PREEMPT) + dev_rxq_wakeup(ctx->port_id); +} + +static inline bool +io_thread_in_sleep(void) +{ + return rte_atomic32_read(&flag_sleep) == IOTHREAD_SLEEP; +} +#else +#define sleep_with_lock k_epoll_wait +#define be_tx_with_lock(ctx) do {} while(0) +#define wake_lookaside_backend(ctx) do {} while(0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GLUE_INTERNAL_H_ */ diff --git a/lib/libtle_glue/log.h b/lib/libtle_glue/log.h new file mode 100644 index 0000000..fe2ac9a --- /dev/null +++ b/lib/libtle_glue/log.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _GLUE_LOG_H_ +#define _GLUE_LOG_H_ + +#include + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * logging related macros. + */ + +#define GLUE_LOG(lvl, fmt, args...) RTE_LOG(lvl, USER1, fmt "\n", ##args) + +#define DUMMY_MACRO do {} while (0) + +#ifdef ENABLE_DEBUG +#define GLUE_DEBUG(fmt, arg...) printf(fmt "\n", ##arg) +#else +#define GLUE_DEBUG(fmt, arg...) DUMMY_MACRO +#endif + +#ifdef ENABLE_TRACE +#define TRACE(fmt, arg...) printf(fmt "\n", ##arg) +#define PKT_DUMP(p) rte_pktmbuf_dump(stdout, (p), 64) +#else +#define TRACE(fmt, arg...) DUMMY_MACRO +#define PKT_DUMP(p) DUMMY_MACRO +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _GLUE_LOG_H_ */ diff --git a/lib/libtle_glue/ndp.h b/lib/libtle_glue/ndp.h new file mode 100644 index 0000000..a61ff5b --- /dev/null +++ b/lib/libtle_glue/ndp.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_NDP_H_ +#define _TLE_NDP_H_ + +#define ND_OPT_SOURCE_LINKLAYER_ADDR 1 +#define ND_OPT_TARGET_LINKLAYER_ADDR 2 +#define ND_OPT_PREFIX_INFORMATION 3 +#define ND_OPT_REDIRECTED_HEADER 4 +#define ND_OPT_MTU 5 + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_NDP_H_ */ diff --git a/lib/libtle_glue/packetdrill.c b/lib/libtle_glue/packetdrill.c new file mode 100644 index 0000000..55bb317 --- /dev/null +++ b/lib/libtle_glue/packetdrill.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include "packetdrill.h" +#include "tle_glue.h" +#include "internal.h" +#include "fd.h" + +#include +#include +#include +#include +#include + +static int vhost_vid; +enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; +static const char *sockname = "/tmp/sock0"; + +static int +new_device(int vid) +{ + vhost_vid = vid; + + /* Disable notifications. */ + rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0); + rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0); + + return 0; +} + +static void +destroy_device(int vid) +{ + RTE_SET_USED(vid); +} + +static const struct vhost_device_ops device_ops = +{ + .new_device = new_device, + .destroy_device = destroy_device, +}; + +static void +vhost_init(void) +{ + unlink(sockname); + + if (rte_vhost_driver_register(sockname, 0) != 0) + rte_exit(EXIT_FAILURE, "failed to register vhost driver \n"); + + if (rte_vhost_driver_callback_register(sockname, &device_ops) != 0) + rte_exit(EXIT_FAILURE, "failed to register vhost driver callbacks.\n"); + + if (rte_vhost_driver_start(sockname) < 0) + rte_exit(EXIT_FAILURE, "failed to start vhost driver.\n"); + + rte_log_set_level(RTE_LOGTYPE_USER1, RTE_LOG_NOTICE); +} + +static uint64_t +now_usecs(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return ((uint64_t) tv.tv_sec * 1000000) + tv.tv_usec; +} + +static void +pd_free(void *userdata) +{ + RTE_SET_USED(userdata); +} + +static int +pd_socket(void *userdata, int domain, int type, int protocol) +{ + RTE_SET_USED(userdata); + return PRE(socket)(domain, type, protocol); +} + +static int +pd_bind(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + RTE_SET_USED(userdata); + return PRE(bind)(sockfd, addr, addrlen); +} + +static int +pd_listen(void *userdata, int sockfd, int backlog) +{ + RTE_SET_USED(userdata); + return PRE(listen)(sockfd, backlog); +} + +static int +pd_accept(void *userdata, int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + RTE_SET_USED(userdata); + return PRE(accept)(sockfd, addr, addrlen); +} + +static int +pd_connect(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + RTE_SET_USED(userdata); + return PRE(connect)(sockfd, addr, addrlen); +} + +static ssize_t +pd_read(void *userdata, int fd, void *buf, size_t count) +{ + RTE_SET_USED(userdata); + return PRE(read)(fd, buf, count); +} + +static ssize_t +pd_readv(void *userdata, int fd, const struct iovec *iov, int iovcnt) +{ + RTE_SET_USED(userdata); + return PRE(readv)(fd, iov, iovcnt); +} + +static ssize_t +pd_recv(void *userdata, int sockfd, void *buf, size_t len, int flags) +{ + RTE_SET_USED(userdata); + return PRE(recv)(sockfd, buf, len, flags); +} + +static ssize_t +pd_recvfrom(void *userdata, int sockfd, void *buf, size_t len, + int flags, struct sockaddr *src_addr, socklen_t *addrlen) +{ + RTE_SET_USED(userdata); + return PRE(recvfrom)(sockfd, buf, len, flags, src_addr, addrlen); +} + +static ssize_t +pd_recvmsg(void *userdata, int sockfd, struct msghdr *msg, int flags) +{ + RTE_SET_USED(userdata); + return PRE(recvmsg)(sockfd, msg, flags); +} + 
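+/* The pd_* hooks above and below simply forward to the corresponding
+ * glue-layer socket calls; PRE() resolves to the glue symbol name
+ * (plain or renamed, depending on the PRELOAD macro; see sym.h).
+ */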
+static ssize_t +pd_write(void *userdata, int fd, const void *buf, size_t count) +{ + RTE_SET_USED(userdata); + return PRE(write)(fd, buf, count); +} + +static ssize_t +pd_writev(void *userdata, int fd, const struct iovec *iov, int iovcnt) +{ + RTE_SET_USED(userdata); + return PRE(writev)(fd, iov, iovcnt); +} + +static ssize_t +pd_send(void *userdata, int sockfd, const void *buf, size_t len, int flags) +{ + RTE_SET_USED(userdata); + return PRE(send)(sockfd, buf, len, flags); +} + +static ssize_t +pd_sendto(void *userdata, int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + RTE_SET_USED(userdata); + return PRE(sendto)(sockfd, buf, len, flags, dest_addr, addrlen); +} + +static ssize_t +pd_sendmsg(void *userdata, int sockfd, const struct msghdr *msg, int flags) +{ + RTE_SET_USED(userdata); + return PRE(sendmsg)(sockfd, msg, flags); +} + +static int +pd_fcntl(void *userdata, int fd, int cmd, ...) +{ + void *arg; + va_list ap; + + va_start(ap, cmd); + arg = va_arg(ap, void *); + va_end(ap); + + RTE_SET_USED(userdata); + return PRE(fcntl)(fd, cmd, arg); +} + +static int +pd_ioctl(void *userdata, int fd, unsigned long request, ...) +{ + void *arg; + va_list ap; + + va_start(ap, request); + arg = va_arg(ap, void *); + va_end(ap); + + RTE_SET_USED(userdata); + return PRE(ioctl)(fd, request, arg); +} + +static int +pd_close(void *userdata, int fd) +{ + RTE_SET_USED(userdata); + return PRE(close)(fd); +} + +static int +pd_shutdown(void *userdata, int sockfd, int how) +{ + RTE_SET_USED(userdata); + return PRE(shutdown)(sockfd, how); +} + +static int +pd_getsockopt(void *userdata, int sockfd, int level, int optname, + void *optval, socklen_t *optlen) +{ + RTE_SET_USED(userdata); + return PRE(getsockopt)(sockfd, level, optname, optval, optlen); +} + +static int +pd_setsockopt(void *userdata, int sockfd, int level, int optname, + const void *optval, socklen_t optlen) +{ + RTE_SET_USED(userdata); + return PRE(setsockopt)(sockfd, level, optname, optval, optlen); +} + +static int +pd_poll(void *userdata, struct pollfd *fds, nfds_t nfds, int timeout) +{ + RTE_SET_USED(userdata); + return PRE(poll)(fds, nfds, timeout); +} + +static struct rte_mbuf * +from_buf_to_mbuf(const void *buf, size_t count) +{ + struct rte_mempool *mp = get_mempool_by_socket(0); + uint16_t nb_mbufs = (count + RTE_MBUF_DEFAULT_DATAROOM - 1) / + RTE_MBUF_DEFAULT_DATAROOM; + struct rte_mbuf *mbufs[nb_mbufs + 1]; + uint16_t i, copy_len; + size_t done = 0; + char *dst; + + if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0)) + rte_exit(EXIT_FAILURE, "allocate mbuf fails\n"); + + for (i = 0; i < nb_mbufs; ++i) { + copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM, + count - done); + dst = rte_pktmbuf_mtod(mbufs[i], char *); + rte_memcpy(dst, (const char *)buf + done, copy_len); + done += copy_len; + mbufs[i]->data_len = copy_len; + if (i > 0) + mbufs[i-1]->next = mbufs[i]; + } + + mbufs[0]->pkt_len = count; + mbufs[0]->nb_segs = nb_mbufs; + + return mbufs[0]; +} + +/* Send @count bytes of data starting from @buf to the TCP stack. + * Return 0 on success or -1 on error. 
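+ * The buffer holds a raw IP packet (assumed IPv4 here); an Ethernet
+ * header is prepended before it is enqueued onto the virtio-user RX ring.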
+ */ +static int +pd_netdev_send(void *userdata, const void *buf, size_t count) +{ + struct ether_hdr *hdr; + struct rte_mbuf *m; + + RTE_SET_USED(userdata); + + m = from_buf_to_mbuf(buf, count); + + // add l2 header + hdr = (struct ether_hdr *)rte_pktmbuf_prepend(m, sizeof(struct ether_hdr)); + hdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + + if (rte_vhost_enqueue_burst(vhost_vid, VIRTIO_RXQ, &m, 1) == 1) + return 0; + + return -1; +} + +static inline struct rte_mbuf * +from_mbuf_to_buf(struct rte_mbuf *m, char *buf, size_t len, int ispeek, int needcpy) +{ + void *src; + uint32_t done = 0; + uint32_t left = len, orig_pkt_len; + uint16_t copy_len, seg_len; + struct rte_mbuf *m_next, *orig_pkt; + + if (len == 0) + return m; + + orig_pkt = m; + orig_pkt_len = m->pkt_len; + + do { + seg_len = rte_pktmbuf_data_len(m); + copy_len = RTE_MIN(seg_len, left); + src = rte_pktmbuf_mtod(m, void *); + if (needcpy) + rte_memcpy(buf + done, src, copy_len); + done += copy_len; + left -= copy_len; + if (copy_len < seg_len) { + if (!ispeek) { + rte_pktmbuf_adj(m, copy_len); + } + break; + } + m_next = m->next; + if (!ispeek) { + rte_pktmbuf_free_seg(m); + } + m = m_next; + } while (left && m); + + if (m && !ispeek) + m->pkt_len = orig_pkt_len - done; + + if(ispeek) + return orig_pkt; + else + return m; +} + +/* Sniff the next packet leaving the TCP stack. + * Put packet data in @buf. @count is passed in as the buffer size. + * The actual number of bytes received should be put in @count. + * Set @count to 0 if received nothing. + * Set @time_usecs to the receive timestamp. + * Return 0 on success or -1 on error. */ +static int +pd_netdev_recv(void *userdata, void *buf, size_t *count, long long *time_usecs) +{ + struct rte_mbuf *m; + struct rte_mempool *mp = get_mempool_by_socket(0); + + RTE_SET_USED(userdata); + + while (rte_vhost_dequeue_burst(vhost_vid, VIRTIO_TXQ, mp, &m, 1) == 0); + + // remove l2 header + rte_pktmbuf_adj(m, sizeof(struct ether_hdr)); + + *count = m->pkt_len; + from_mbuf_to_buf(m, buf, *count, 0, 1); + + *time_usecs = now_usecs(); + return 0; +} + +static int +pd_usleep(void *userdata, useconds_t usec) +{ + RTE_SET_USED(userdata); + return usleep(usec); +} + +static int +pd_gettimeofday(void *userdata, struct timeval *tv, struct timezone *tz) +{ + RTE_SET_USED(userdata); + return gettimeofday(tv, tz); +} + +static int +pd_epoll_create(void *userdata, int size) +{ + RTE_SET_USED(userdata); + return PRE(epoll_create)(size); +} + +static int +pd_epoll_ctl(void *userdata, int epfd, int op, int fd, + struct epoll_event *event) +{ + RTE_SET_USED(userdata); + return PRE(epoll_ctl)(epfd, op, fd, event); +} + +static int +pd_epoll_wait(void *userdata, int epfd, struct epoll_event *events, + int maxevents, int timeout) +{ + RTE_SET_USED(userdata); + return PRE(epoll_wait)(epfd, events, maxevents, timeout); +} + +static int +pd_pipe(void *userdata, int pipefd[2]) +{ + RTE_SET_USED(userdata); + return pipe(pipefd); +} + +static int +pd_splice(void *userdata, int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags) +{ + RTE_SET_USED(userdata); + return PRE(splice)(fd_in, off_in, fd_out, off_out, len, flags); +} + +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; + +static void * +io(void *arg) +{ + int epfd; + struct in_addr ipv4; + struct ether_addr mac = { .addr_bytes = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff}, }; + struct epoll_event events[128]; + + RTE_SET_USED(arg); + + setenv(DPDK_IP, "192.168.0.2", 1); + setenv(DPDK_IP_MASK, "16", 1); + 
setenv(DPDK_IP_GATEWAY, "192.168.0.1", 1); + + setenv(DPDK_IPV6, "fd3d:fa7b:d17d::0", 1); + setenv(DPDK_IPV6_MASK, "48", 1); + setenv(DPDK_IPV6_GATEWAY, "fd3d:fa7b:d17d:8888::0", 1); + + epfd = PRE(epoll_create)(0); + + inet_pton(AF_INET, "192.168.0.1", &ipv4); + + ipv4_dst_add(default_ctx, &ipv4, &mac); + + pthread_mutex_unlock(&lock); + + while (1) { + PRE(epoll_wait)(epfd, events, 128, 1000); + } + + return NULL; +} + +void +packetdrill_interface_init(const char *flags, + struct packetdrill_interface *ifc) +{ + int argc = 0; + char *argv[16]; + pthread_t tid; + + RTE_SET_USED(flags); + + argv[argc++] = strdup("test"); + argv[argc++] = strdup("-l"); + argv[argc++] = strdup("0"); + argv[argc++] = strdup("--no-pci"); + argv[argc++] = strdup("--in-memory"); + argv[argc++] = strdup("--single-file-segments"); + argv[argc++] = strdup("--"); + + if (rte_eal_init(argc, argv) < 0) + rte_exit(EXIT_FAILURE, "Failed to init DPDK\n"); + + fd_init(); + + vhost_init(); + + if (rte_eal_hotplug_add("vdev", "virtio_user0", "path=/tmp/sock0") < 0) + rte_exit(EXIT_FAILURE, "hot plug virtio-user failed\n"); + + pthread_mutex_lock(&lock); + + pthread_create(&tid, NULL, io, NULL); + + pthread_mutex_lock(&lock); + + ifc->free = pd_free; + ifc->socket = pd_socket; + ifc->bind = pd_bind; + ifc->listen = pd_listen; + ifc->accept = pd_accept; + ifc->connect = pd_connect; + ifc->read = pd_read; + ifc->readv = pd_readv; + ifc->recv = pd_recv; + ifc->recvfrom = pd_recvfrom; + ifc->recvmsg = pd_recvmsg; + ifc->write = pd_write; + ifc->writev = pd_writev; + ifc->send = pd_send; + ifc->sendto = pd_sendto; + ifc->sendmsg = pd_sendmsg; + ifc->fcntl = pd_fcntl; + ifc->ioctl = pd_ioctl; + ifc->close = pd_close; + ifc->shutdown = pd_shutdown; + ifc->getsockopt = pd_getsockopt; + ifc->setsockopt = pd_setsockopt; + ifc->poll = pd_poll; + ifc->netdev_send = pd_netdev_send; + ifc->netdev_receive = pd_netdev_recv; + ifc->usleep = pd_usleep; + ifc->gettimeofday = pd_gettimeofday; + ifc->epoll_create = pd_epoll_create; + ifc->epoll_ctl = pd_epoll_ctl; + ifc->epoll_wait = pd_epoll_wait; + ifc->pipe = pd_pipe; + ifc->splice = pd_splice; +} diff --git a/lib/libtle_glue/packetdrill.h b/lib/libtle_glue/packetdrill.h new file mode 100644 index 0000000..6f84a87 --- /dev/null +++ b/lib/libtle_glue/packetdrill.h @@ -0,0 +1,111 @@ +/* + * Copyright 2015 Google Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +/* + * Author: xiaoj@google.com (Xiao Jia) + * + * Interface for packetdrill. + * + * To be tested against as a shared object (*.so) file, implement this + * interface, export a function "packetdrill_interface_init", and + * initialize the interface struct passed in with your own functions. 
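+ *
+ * (packetdrill is expected to dlopen() this shared object, resolve
+ * packetdrill_interface_init() and then exercise the stack under test
+ * through the hooks it fills in.)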
+ */ + +#ifndef __PACKETDRILL_H__ +#define __PACKETDRILL_H__ + +#include +#include +#include +#include +#include +#include +#include + +struct packetdrill_interface { + void *userdata; + void (*free)(void *userdata); + int (*socket)(void *userdata, int domain, int type, int protocol); + int (*bind)(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + int (*listen)(void *userdata, int sockfd, int backlog); + int (*accept)(void *userdata, int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + int (*connect)(void *userdata, int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + ssize_t (*read)(void *userdata, int fd, void *buf, size_t count); + ssize_t (*readv)(void *userdata, int fd, const struct iovec *iov, + int iovcnt); + ssize_t (*recv)(void *userdata, int sockfd, void *buf, size_t len, + int flags); + ssize_t (*recvfrom)(void *userdata, int sockfd, void *buf, size_t len, + int flags, struct sockaddr *src_addr, + socklen_t *addrlen); + ssize_t (*recvmsg)(void *userdata, int sockfd, struct msghdr *msg, + int flags); + ssize_t (*write)(void *userdata, int fd, const void *buf, size_t count); + ssize_t (*writev)(void *userdata, int fd, const struct iovec *iov, + int iovcnt); + ssize_t (*send)(void *userdata, int sockfd, const void *buf, size_t len, + int flags); + ssize_t (*sendto)(void *userdata, int sockfd, const void *buf, + size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); + ssize_t (*sendmsg)(void *userdata, int sockfd, const struct msghdr *msg, + int flags); + int (*fcntl)(void *userdata, int fd, int cmd, ...); + int (*ioctl)(void *userdata, int fd, unsigned long request, ...); + int (*close)(void *userdata, int fd); + int (*shutdown)(void *userdata, int sockfd, int how); + int (*getsockopt)(void *userdata, int sockfd, int level, int optname, + void *optval, socklen_t *optlen); + int (*setsockopt)(void *userdata, int sockfd, int level, int optname, + const void *optval, socklen_t optlen); + int (*poll)(void *userdata, struct pollfd *fds, nfds_t nfds, + int timeout); + /* Send @count bytes of data starting from @buf to the TCP stack. + * Return 0 on success or -1 on error. */ + int (*netdev_send)(void *userdata, const void *buf, size_t count); + /* Sniff the next packet leaving the TCP stack. + * Put packet data in @buf. @count is passed in as the buffer size. + * The actual number of bytes received should be put in @count. + * Set @count to 0 if received nothing. + * Set @time_usecs to the receive timestamp. + * Return 0 on success or -1 on error. 
*/ + int (*netdev_receive)(void *userdata, void *buf, size_t *count, + long long *time_usecs); + int (*usleep)(void *userdata, useconds_t usec); + int (*gettimeofday)(void *userdata, struct timeval *tv, + struct timezone *tz); + int (*epoll_create)(void *userdata, int size); + int (*epoll_ctl)(void *userdata, int epfd, int op, int fd, + struct epoll_event *event); + int (*epoll_wait)(void *userdata, int epfd, struct epoll_event *events, + int maxevents, int timeout); + int (*pipe)(void *userdata, int pipefd[2]); + int (*splice)(void *userdata, int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags); +}; + +typedef void (*packetdrill_interface_init_t)(const char *flags, + struct packetdrill_interface *); + +void +packetdrill_interface_init(const char *flags, struct packetdrill_interface *ifc); + +#endif /* __PACKETDRILL_H__ */ diff --git a/lib/libtle_glue/poll.c b/lib/libtle_glue/poll.c new file mode 100644 index 0000000..e1e2afb --- /dev/null +++ b/lib/libtle_glue/poll.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include + +#include "fd.h" +#include "ctx.h" +#include "sym.h" +#include "log.h" +#include "util.h" +#include "internal.h" +#include "tle_glue.h" + +int +PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout) +{ + int efd; + int total = 0, j; + int tmp_ev; + uint32_t i; + uint32_t k_n = 0; + int k_fds[nfds]; + struct epoll_event k_ev; + struct sock *so; + struct glue_ctx *ctx; + struct epoll_event events[nfds]; + + for (i = 0; i < nfds; ++i) { + if (is_kernel_fd(fds[i].fd)) { + k_fds[k_n++] = i; + } else { + so = fd2sock(fds[i].fd); + /* fix me: check if fd is opened */ + + /* To check, if we already have some ready events */ + if (TLE_SEV_DOWN != tle_event_state(so->erev)) { + fds[i].revents = POLLERR | POLLHUP | + (fds[i].events & (POLLIN | POLLOUT)); + total++; + continue; + } + if ((fds[i].events & POLLIN) && + TLE_SEV_DOWN != tle_event_state(so->rxev)) { + fds[i].revents = POLLIN; + total++; + continue; + } + if ((fds[i].events & POLLOUT) && + TLE_SEV_DOWN != tle_event_state(so->txev)) { + fds[i].revents = POLLOUT; + total++; + continue; + } + + /* We fill sock->event here as we need this when + * we filter events in poll_common(). But it was + * originally set by epoll_ctl(). Now we have to + * assume that there are no application which + * uses epoll and poll at the same time. 
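+			 * Mixing the two on the same fd would clobber the
+			 * epoll registration stored in sock->event here.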
+ */ + so->event.events = fds[i].events; + so->event.data.u32 = i; /* store idx */ + } + } + + if (k_n == nfds) + return k_poll(fds, nfds, timeout); + + if (total > 0) + return total; + + /* thread <> context binding happens here */ + if (RTE_PER_LCORE(glue_ctx) == NULL) { + ctx = &ctx_array[glue_ctx_alloc()]; + RTE_PER_LCORE(glue_ctx) = ctx; + } else + ctx = RTE_PER_LCORE(glue_ctx); + + total = poll_common(ctx, events, nfds, 0, -1); + + /* We assume kernel I/O events are not as important as user ones */ + if (total > 0) + goto format; + + efd = k_epoll_create(1); + if (efd < 0) + rte_panic("k_epoll_create failed %d", errno); + + for (i = 0; i < k_n; ++i) { + k_ev.events = fds[k_fds[i]].events; + k_ev.data.u32 = k_fds[i]; /* store idx */ + k_epoll_ctl(efd, EPOLL_CTL_ADD, fds[k_fds[i]].fd, &k_ev); + } + + total = poll_common(ctx, events, nfds, timeout, efd); + k_close(efd); +format: + for (j = 0; j < total; ++j) { + tmp_ev = events[j].events; + if (tmp_ev == POLLHUP) { + tmp_ev |= POLLERR | + (fds[events[j].data.u32].events & (POLLIN | POLLOUT)); + } + fds[events[j].data.u32].revents = tmp_ev; + } + + return total; +} + +int +PRE(ppoll)(struct pollfd *fds, nfds_t nfds, + const struct timespec *tmo_p, const sigset_t *sigmask) +{ + int timeout; + + if (sigmask != NULL) { + rte_panic("ppoll with signal is not supported"); + } + + if (tmo_p == NULL) + timeout = -1; + else + timeout = tmo_p->tv_sec * 1000 + tmo_p->tv_nsec / 1000000; + + return poll(fds, nfds, timeout); +} + +extern int __poll_chk(struct pollfd *fds, nfds_t nfds, int timeout, + __SIZE_TYPE__ fdslen); +int +__poll_chk(struct pollfd *fds, nfds_t nfds, int timeout, + __SIZE_TYPE__ fdslen __rte_unused) +{ + return poll(fds, nfds, timeout); +} diff --git a/lib/libtle_glue/port.c b/lib/libtle_glue/port.c new file mode 100644 index 0000000..7e95bee --- /dev/null +++ b/lib/libtle_glue/port.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include "log.h" +#include "ctx.h" +#include "config.h" +#include "internal.h" + +int stopped; + +static struct rte_mempool *mpool[RTE_MAX_NUMA_NODES]; + +struct rte_mempool * +get_mempool_by_socket(int32_t socket_id) +{ + struct rte_mempool *mp; + char name[RTE_MEMPOOL_NAMESIZE]; + + if (socket_id == SOCKET_ID_ANY) + socket_id = 0; + + if (mpool[socket_id]) + return mpool[socket_id]; + + snprintf(name, sizeof(name), "MP%u", socket_id); + mp = rte_pktmbuf_dynamic_pool_create(name, MAX_MBUFS - 1, + MBUF_PERCORE_CACHE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, + socket_id, MBUF_DYNAMIC_SIZE); + + if (mp == NULL) + rte_panic("Failed to create mbuf mempool"); + + mpool[socket_id] = mp; + return mp; +} + +static void +update_rss_conf(uint16_t port_id) +{ + struct rte_eth_rss_conf rss_conf = { + .rss_key = NULL, + .rss_key_len = 0, + .rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP, + }; + + if (rte_eth_dev_rss_hash_update(port_id, &rss_conf) < 0) + rte_panic("Failed to update rss hash"); +} + +static void +queue_init(uint16_t port_id, uint16_t nb_queues, + struct rte_eth_dev_info *dev_info, + struct rte_eth_conf *port_conf) +{ + uint16_t q; + int32_t socket_id, rc; + uint16_t nb_rxd = 1024, nb_txd = 1024; + struct rte_mempool *mp; + struct rte_eth_txconf txq_conf = dev_info->default_txconf; + struct rte_eth_rxconf rxq_conf = dev_info->default_rxconf; + + socket_id = rte_eth_dev_socket_id(port_id); + mp = get_mempool_by_socket(socket_id); + + dev_info->default_rxconf.rx_drop_en = 1; + + rc = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd); + if (rc < 0) + rte_panic("Cannot adjust number of desc"); + + rxq_conf.offloads = port_conf->rxmode.offloads; + txq_conf.offloads = port_conf->txmode.offloads; + + /* faster free of tx entries */ + txq_conf.tx_free_thresh = nb_txd - 64; + + for (q = 0; q < nb_queues; q++) { + rc = rte_eth_rx_queue_setup(port_id, q, nb_rxd, + socket_id, &rxq_conf, mp); + if (rc < 0) + rte_panic("rx queue=%u setup failed: %d", q, rc); + + rc = setup_rx_cb(port_id, q); + if (rc < 0) + rte_panic("rx queue=%u rx setup failed: %d", q, rc); + } + + for (q = 0; q < nb_queues; q++) { + rc = rte_eth_tx_queue_setup(port_id, q, nb_txd, + socket_id, &txq_conf); + if (rc < 0) + rte_panic("tx queue=%u setup failed: %d", q, rc); + } +} + +uint64_t rx_offload = + DEV_RX_OFFLOAD_IPV4_CKSUM | + DEV_RX_OFFLOAD_UDP_CKSUM | + DEV_RX_OFFLOAD_TCP_CKSUM; +/* nice to have: + DEV_RX_OFFLOAD_CRC_STRIP | + DEV_RX_OFFLOAD_TCP_LRO | + DEV_RX_OFFLOAD_HEADER_SPLIT | + DEV_RX_OFFLOAD_SCATTER | + DEV_RX_OFFLOAD_TIMESTAMP +*/ + +uint64_t tx_offload = + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM | + DEV_TX_OFFLOAD_TCP_TSO | + DEV_TX_OFFLOAD_MULTI_SEGS; + +int +dev_rxq_wakeup(uint16_t port_id) +{ + int fd; + uint16_t qid; + uint32_t vec, efd_idx; + struct rte_eth_dev *dev; + struct rte_intr_handle *intr_handle; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + intr_handle = dev->intr_handle; + if (!intr_handle) + return -ENOTSUP; + if (!intr_handle->intr_vec) + return -EPERM; + + for (qid = 0; qid < dev->data->nb_rx_queues; qid++) { + vec = intr_handle->intr_vec[qid]; + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? 
+ (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + fd = intr_handle->efds[efd_idx]; + if (eventfd_write(fd, (eventfd_t) 1) < 0) + return -errno; + } + + return 0; +} + +void +port_reconfig(void) +{ + int32_t rc; + struct rte_eth_dev_info dev_info; + uint16_t port_id = 0; /* We use and only use port 0 */ + uint16_t nb_port; + uint16_t nb_queues = nb_ctx; + + struct rte_eth_conf port_conf = { + .intr_conf = { + .rxq = 1, + }, + }; + + /* 0. dev number check */ + nb_port = rte_eth_dev_count_avail(); + if (nb_port < 1 || nb_port >2) + rte_panic("One port is mandatory with an optional loopback device\n"); + + stopped = 1; + rte_wmb(); + /* wake up all rxqs */ + if (nb_ctx > 1) + dev_rxq_wakeup(port_id); + + usleep(1); /* fix me: this cannot gurantee correctness */ + + rte_eth_dev_stop(port_id); + + /* 1. offloading check and set*/ + rte_eth_dev_info_get(port_id, &dev_info); + rx_offload &= dev_info.rx_offload_capa; + port_conf.rxmode.offloads = rx_offload; + tx_offload &= dev_info.tx_offload_capa; + port_conf.txmode.offloads = tx_offload; + + GLUE_LOG(INFO, "configure queues = %d, offloads: rx = %"PRIx64", tx = %"PRIx64, + nb_queues, rx_offload, tx_offload); + + /* 2. dev configure */ + rc = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf); + if (rc != 0) + rte_panic("Failed to configure device, %d", rc); + + /* 3. queue setup */ + queue_init(port_id, nb_queues, &dev_info, &port_conf); + + /* 4. rss conf */ + if (nb_queues > 1) + update_rss_conf(port_id); + + /* 5. dev start */ + if (rte_eth_dev_start(port_id) < 0) + rte_panic("Failed to start device"); + + stopped = 0; +} + +uint16_t +create_loopback(uint32_t socket_id) +{ + int ret; + struct rte_ring* lb_queue; + static uint16_t lb_port_id = 0xFFFF; + const char *ring_name = "loopback-ring"; + + if (lb_port_id != 0xFFFF) + return lb_port_id; + + lb_queue = rte_ring_create(ring_name, MAX_PKTS_BURST * 8, socket_id, + RING_F_SP_ENQ | RING_F_SC_DEQ); + if (!lb_queue) + rte_panic("Failed to create ring for loopback\n"); + ret = rte_eth_from_ring(lb_queue); + if (ret < 0) + rte_panic("Failed to create ethdev from ring\n"); + lb_port_id = ret; + + if (setup_rx_cb(lb_port_id, 0) < 0) + rte_panic("Failed to set up rx cb for loopback\n"); + + return lb_port_id; +} diff --git a/lib/libtle_glue/rxcb.c b/lib/libtle_glue/rxcb.c new file mode 100644 index 0000000..cdcd756 --- /dev/null +++ b/lib/libtle_glue/rxcb.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
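The ring-backed loopback port built by create_loopback() above can be
reproduced in isolation; a minimal sketch using the same DPDK calls (ring
name and size are arbitrary here, error handling elided):

    #include <rte_ring.h>
    #include <rte_eth_ring.h>

    /* Sketch: a single-ring loopback ethdev; packets tx-burst into the
     * returned port come straight back on rx-burst from the same port. */
    static int
    make_loopback(unsigned int socket_id)
    {
            struct rte_ring *r = rte_ring_create("lb-demo", 512, socket_id,
                            RING_F_SP_ENQ | RING_F_SC_DEQ);

            if (r == NULL)
                    return -1;
            return rte_eth_from_ring(r); /* new port id, or negative on error */
    }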
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "log.h" +#include "ctx.h" +#include "internal.h" + +struct ptype2cb { + uint32_t mask; + const char *name; + rte_rx_callback_fn fn; +}; + +enum { + ETHER_ARP_PTYPE = 0x1, + IPV4_PTYPE = 0x2, + IPV4_EXT_PTYPE = 0x4, + IPV6_PTYPE = 0x8, + IPV6_EXT_PTYPE = 0x10, + TCP_PTYPE = 0x20, + UDP_PTYPE = 0x40, + ICMP_PTYPE = 0x80, +}; + +static inline uint64_t +_mbuf_tx_offload(uint64_t il2, uint64_t il3, uint64_t il4, uint64_t tso, + uint64_t ol3, uint64_t ol2) +{ + return il2 | il3 << 7 | il4 << 16 | tso << 24 | ol3 << 40 | ol2 << 49; +} + +static inline int32_t +fill_pkt_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t l3, uint32_t l4) +{ + if (l2 + l3 + l4 > m->pkt_len) + return -1; + m->tx_offload = _mbuf_tx_offload(l2, l3, l4, 0, 0, 0); + return 0; +} + +static inline int +is_ipv4_frag(const struct ipv4_hdr *iph) +{ + const uint16_t mask = rte_cpu_to_be_16(~IPV4_HDR_DF_FLAG); + + return ((mask & iph->fragment_offset) != 0); +} + +static inline uint32_t +get_tcp_header_size(struct rte_mbuf *m, uint32_t l2_len, uint32_t l3_len) +{ + const struct tcp_hdr *tcp; + + tcp = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, l2_len + l3_len); + return (tcp->data_off >> 4) * 4; +} + +static inline int32_t +adjust_ipv4_pktlen(struct rte_mbuf *m, uint32_t l2_len) +{ + uint32_t plen, trim; + const struct ipv4_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len); + plen = rte_be_to_cpu_16(iph->total_length) + l2_len; + if (plen < m->pkt_len) { + trim = m->pkt_len - plen; + rte_pktmbuf_trim(m, trim); + } else if (plen > m->pkt_len) { + return -1; + } + return 0; +} + +static inline int32_t +adjust_ipv6_pktlen(struct rte_mbuf *m, uint32_t l2_len) +{ + uint32_t plen, trim; + const struct ipv6_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, l2_len); + plen = rte_be_to_cpu_16(iph->payload_len) + sizeof(*iph) + l2_len; + if (plen < m->pkt_len) { + trim = m->pkt_len - plen; + rte_pktmbuf_trim(m, trim); + } else if (plen > m->pkt_len) { + return -1; + } + return 0; +} + +static inline uint32_t +get_ipv4_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t proto, uint32_t frag) +{ + const struct ipv4_hdr *iph; + int32_t dlen, len; + + dlen = rte_pktmbuf_data_len(m); + dlen -= l2; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2); + len = (iph->version_ihl & IPV4_HDR_IHL_MASK) * IPV4_IHL_MULTIPLIER; + + if (frag != 0 && is_ipv4_frag(iph)) { + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_FRAG; + } + + if (len > dlen || (proto <= IPPROTO_MAX && iph->next_proto_id != proto)) + m->packet_type = RTE_PTYPE_UNKNOWN; + + return len; +} + +static inline uint32_t +get_ipv6x_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t *fproto) +{ + const struct ipv6_hdr *ip6h; + const struct ip6_ext *ipx; + uint32_t nproto; + int32_t dlen, len, ofs; + + ip6h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2); + nproto = ip6h->proto; + len = sizeof(struct ipv6_hdr); + + dlen = rte_pktmbuf_data_len(m); + dlen -= l2; + + ofs = l2 + len; + ipx = rte_pktmbuf_mtod_offset(m, const struct ip6_ext *, ofs); + + while (ofs > 0 && len < dlen) { + + switch (nproto) { + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + ofs = (ipx->ip6e_len + 1) << 3; + break; + case IPPROTO_AH: + ofs = (ipx->ip6e_len + 2) << 2; + break; + case IPPROTO_FRAGMENT: + /* + * tso_segsz is not used by RX, so use it as temporary + * buffer to store the fragment offset. 
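_mbuf_tx_offload() above packs the per-layer header lengths into the mbuf
tx_offload bit-field (7 bits for l2, 9 for l3, 8 for l4 in DPDK's layout,
hence the 7/16/24/40/49 shifts). A self-contained check of the packing under
those assumptions:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    pack_tx_offload(uint64_t l2, uint64_t l3, uint64_t l4)
    {
            /* same inner-header layout as _mbuf_tx_offload() above,
             * with the tso/outer fields left at zero */
            return l2 | l3 << 7 | l4 << 16;
    }

    int
    main(void)
    {
            uint64_t v = pack_tx_offload(14, 20, 20);

            assert((v & 0x7f) == 14);         /* l2_len */
            assert(((v >> 7) & 0x1ff) == 20); /* l3_len */
            assert(((v >> 16) & 0xff) == 20); /* l4_len */
            return 0;
    }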
+ */ + m->tso_segsz = l2 + len; + ofs = sizeof(struct ip6_frag); + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_FRAG; + break; + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ICMPV6: + ofs = 0; + if (*fproto == 0) + *fproto = nproto; + break; + default: + ofs = 0; + } + + if (ofs > 0) { + nproto = ipx->ip6e_nxt; + len += ofs; + ipx += ofs / sizeof(*ipx); + } + } + + /* unrecognized or invalid packet. */ + if (*fproto == 0 || len > dlen) + m->packet_type = RTE_PTYPE_UNKNOWN; + + return len; +} + +static inline uint32_t +get_ipv6_hdr_len(struct rte_mbuf *m, uint32_t l2, uint32_t fproto) +{ + const struct ipv6_hdr *iph; + + iph = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, + sizeof(struct ether_hdr)); + + if (iph->proto == fproto) + return sizeof(struct ipv6_hdr); + else + return get_ipv6x_hdr_len(m, l2, &fproto); +} + +static inline struct rte_mbuf* +process_ipv4_frag(struct rte_mbuf *m, struct glue_ctx *ctx, uint32_t l2_len, uint32_t l3_len) +{ + struct ipv4_hdr* iph; + + m->l2_len = l2_len; + m->l3_len = l3_len; + /* fixme: ip checksum should be checked here. + * After reassemble, the ip checksum would be invalid. + */ + m = rte_ipv4_frag_reassemble_packet(ctx->frag_tbl, + &ctx->frag_dr, m, rte_rdtsc(), + rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len)); + rte_ip_frag_free_death_row(&ctx->frag_dr, 3); + if (m == NULL) + return NULL; + iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, m->l2_len); + switch (iph->next_proto_id) { + case IPPROTO_TCP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_TCP; + break; + case IPPROTO_UDP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_UDP; + break; + } + return m; +} + +static inline struct rte_mbuf* +process_ipv6_frag(struct rte_mbuf *m, struct glue_ctx *ctx, uint32_t l2_len, uint32_t l3_len) +{ + struct ipv6_hdr* ip6h; + + m->l2_len = l2_len; + m->l3_len = l3_len; + m = rte_ipv6_frag_reassemble_packet(ctx->frag_tbl, + &ctx->frag_dr, m, rte_rdtsc(), + rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, l2_len), + rte_pktmbuf_mtod_offset(m, struct ipv6_extension_fragment*, m->tso_segsz)); + rte_ip_frag_free_death_row(&ctx->frag_dr, 3); + if (m == NULL) + return NULL; + ip6h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, m->l2_len); + switch (ip6h->proto) { + case IPPROTO_TCP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_TCP; + break; + case IPPROTO_UDP: + m->packet_type &= ~RTE_PTYPE_L4_MASK; + m->packet_type |= RTE_PTYPE_L4_UDP; + break; + } + return m; +} + +static inline struct rte_mbuf * +fill_ptypes_and_hdr_len(struct glue_ctx *ctx, struct rte_mbuf *m) +{ + uint32_t dlen, l2_len, l3_len, l4_len, proto; + const struct ether_hdr *eth; + uint32_t ptypes; + uint16_t etp; + int32_t error = 0; + + dlen = rte_pktmbuf_data_len(m); + + /* L2 */ + l2_len = sizeof(*eth); + + eth = rte_pktmbuf_mtod(m, const struct ether_hdr *); + etp = eth->ether_type; + while (etp == rte_be_to_cpu_16(ETHER_TYPE_VLAN)) { + etp = rte_pktmbuf_mtod_offset(m, struct vlan_hdr*, l2_len)->eth_proto; + l2_len += sizeof(struct vlan_hdr); + } + + if (etp == rte_be_to_cpu_16(ETHER_TYPE_ARP)) + return arp_recv(ctx, m, l2_len); + + if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv4)) { + const struct ipv4_hdr *hdr; + + /* L3 */ + hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, l2_len); + error = adjust_ipv4_pktlen(m, l2_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1); + + if ((m->packet_type 
& RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv4_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + hdr = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr*, m->l2_len); + l3_len = get_ipv4_hdr_len(m, m->l2_len, IPPROTO_MAX + 1, 0); + } + + /* L4 */ + switch (hdr->next_proto_id) { + case IPPROTO_ICMP: + return icmp_recv(ctx, m, l2_len, l3_len); + case IPPROTO_TCP: + ptypes = RTE_PTYPE_L4_TCP | + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = get_tcp_header_size(m, l2_len, l3_len); + break; + case IPPROTO_UDP: + ptypes = RTE_PTYPE_L4_UDP | + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = sizeof(struct udp_hdr); + break; + default: + GLUE_LOG(ERR, "drop ipv4 pkt of unknow L4: (%d)", + hdr->next_proto_id); + rte_pktmbuf_free(m); + return NULL; + } + + } else if (etp == rte_be_to_cpu_16(ETHER_TYPE_IPv6) && + dlen >= l2_len + sizeof(struct ipv6_hdr) + sizeof(struct udp_hdr)) { + /* L3 */ + error = adjust_ipv6_pktlen(m, l2_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + proto = 0; + l3_len = get_ipv6x_hdr_len(m, l2_len, &proto); + + if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv6_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + l3_len = get_ipv6x_hdr_len(m, m->l2_len, &proto); + } + + /* L4 */ + switch (proto) { + case IPPROTO_TCP: + ptypes = RTE_PTYPE_L4_TCP | + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = get_tcp_header_size(m, l2_len, l3_len); + break; + case IPPROTO_UDP: + ptypes = RTE_PTYPE_L4_UDP | + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L2_ETHER; + l4_len = sizeof(struct udp_hdr); + break; + case IPPROTO_ICMPV6: + return icmp6_recv(ctx, m, l2_len, l3_len); + default: + GLUE_DEBUG("drop ipv6 pkt of unknown L4: (%x)", proto); + rte_pktmbuf_free(m); + return NULL; + } + } else { + GLUE_DEBUG("Drop unknown L3 packet: %x", etp); + rte_pktmbuf_free(m); + return NULL; + } + + m->packet_type = ptypes; + error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + + return m; +} + +/* exclude NULLs from the final list of packets. */ +static inline uint32_t +compress_pkt_list(struct rte_mbuf *pkt[], uint32_t nb_pkt, uint32_t nb_zero) +{ + uint32_t i, j, k, l; + + for (j = nb_pkt; nb_zero != 0 && j-- != 0; ) { + + /* found a hole. */ + if (pkt[j] == NULL) { + + /* find how big is it. */ + for (i = j; i-- != 0 && pkt[i] == NULL; ) + ; + /* fill the hole. */ + for (k = j + 1, l = i + 1; k != nb_pkt; k++, l++) + pkt[l] = pkt[k]; + + nb_pkt -= j - i; + nb_zero -= j - i; + j = i + 1; + } + } + + return nb_pkt; +} + +static inline struct rte_mbuf * +common_fill_hdr_len(struct rte_mbuf *m, uint32_t tp, struct glue_ctx *ctx) +{ + uint32_t l4_len, l3_len, l2_len = sizeof(struct ether_hdr); + int32_t error = 0; + + switch (tp) { + /* possibly fragmented packets. 
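compress_pkt_list() above squeezes out the NULL slots left behind by the
ARP/ICMP handlers while preserving packet order; it walks backwards so each
hole is shifted over roughly once. Behaviorally it matches this simpler
forward sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch: stable left-compaction; returns the new element count. */
    static uint32_t
    compact_pkts(void *pkt[], uint32_t nb_pkt)
    {
            uint32_t i, k = 0;

            for (i = 0; i != nb_pkt; i++)
                    if (pkt[i] != NULL)
                            pkt[k++] = pkt[i];
            return k;
    }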
*/ + case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(m, l2_len, IPPROTO_MAX + 1, 1); + if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv4_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + tp = m->packet_type & (RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK | + RTE_PTYPE_L4_MASK); + } + break; + case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_MAX + 1); + if ((m->packet_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_FRAG) { + m = process_ipv6_frag(m, ctx, l2_len, l3_len); + if (m == NULL) + return NULL; + tp = m->packet_type & (RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK | + RTE_PTYPE_L4_MASK); + } + break; + } + + switch (tp) { + /* non fragmented tcp packets. */ + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4 | + RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv4_hdr); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6 | + RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv6_hdr); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv6_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(m, l2_len, + IPPROTO_TCP, 0); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_TCP | RTE_PTYPE_L3_IPV6_EXT | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_TCP); + l4_len = get_tcp_header_size(m, l2_len, l3_len); + error = adjust_ipv6_pktlen(m, l2_len); + break; + + /* non fragmented udp packets. 
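The adjust_ipv{4,6}_pktlen() calls through the switch above trim trailing L2
padding so that pkt_len agrees with what the IP header advertises, and reject
packets that claim more bytes than the mbuf actually holds. The core of that
logic, as a sketch:

    #include <rte_mbuf.h>

    /* Sketch: align mbuf pkt_len with the IP-advertised length.
     * Mirrors adjust_ipv4_pktlen()/adjust_ipv6_pktlen() above. */
    static int
    trim_to_ip_len(struct rte_mbuf *m, uint32_t l2_len, uint32_t ip_len)
    {
            uint32_t want = l2_len + ip_len;

            if (want > m->pkt_len)
                    return -1; /* truncated/invalid packet */
            if (want < m->pkt_len)
                    rte_pktmbuf_trim(m, m->pkt_len - want);
            return 0;
    }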
*/ + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4 | + RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv4_hdr); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6 | + RTE_PTYPE_L2_ETHER): + l3_len = sizeof(struct ipv6_hdr); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv6_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv4_hdr_len(m, l2_len, + IPPROTO_UDP, 0); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv4_pktlen(m, l2_len); + break; + case (RTE_PTYPE_L4_UDP | RTE_PTYPE_L3_IPV6_EXT | + RTE_PTYPE_L2_ETHER): + l3_len = get_ipv6_hdr_len(m, l2_len, IPPROTO_UDP); + l4_len = sizeof(struct udp_hdr); + error = adjust_ipv6_pktlen(m, l2_len); + break; + default: + GLUE_LOG(ERR, "drop unknown pkt"); + rte_pktmbuf_free(m); + return NULL; + } + + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + error = fill_pkt_hdr_len(m, l2_len, l3_len, l4_len); + if (error) { + rte_pktmbuf_free(m); + return NULL; + } + return m; +} + + +/* + * HW can recognize L2-arp/L3 with/without extensions/L4 (i40e) + */ +static uint16_t +type0_rx_callback(uint16_t port, + uint16_t queue, + struct rte_mbuf *pkt[], + uint16_t nb_pkts, + uint16_t max_pkts, + void *user_param) +{ + uint32_t j, tp, l2_len, l3_len; + struct glue_ctx *ctx; + uint16_t nb_zero = 0; + + RTE_SET_USED(port); + RTE_SET_USED(queue); + RTE_SET_USED(max_pkts); + + ctx = user_param; + + for (j = 0; j != nb_pkts; j++) { + tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK | + RTE_PTYPE_L3_MASK | RTE_PTYPE_L2_MASK); + + switch (tp) { + case (RTE_PTYPE_L2_ETHER_ARP): + arp_recv(ctx, pkt[j], sizeof(struct ether_hdr)); + pkt[j] = NULL; + nb_zero++; + break; + case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4 | + RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L2_ETHER): + l2_len = sizeof(struct ether_hdr); + l3_len = get_ipv4_hdr_len(pkt[j], l2_len, IPPROTO_ICMP, 0); + icmp_recv(ctx, pkt[j], l2_len, l3_len); + pkt[j] = NULL; + nb_zero++; + break; + case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6 | + RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L4_ICMP | RTE_PTYPE_L3_IPV6_EXT | + RTE_PTYPE_L2_ETHER): + l2_len = sizeof(struct ether_hdr); + l3_len = get_ipv6_hdr_len(pkt[j], l2_len, IPPROTO_ICMPV6); + icmp6_recv(ctx, pkt[j], l2_len, l3_len); + pkt[j] = NULL; + nb_zero++; + break; + default: + if (common_fill_hdr_len(pkt[j], tp, ctx) == NULL) { + pkt[j] = NULL; + nb_zero++; + } + break; + } + } + + if (nb_zero == 0) + return nb_pkts; + + return compress_pkt_list(pkt, nb_pkts, nb_zero); +} + +/* + * HW can recognize L2/L3/L4 and fragments; but cannot recognize ARP + * nor ICMP (ixgbe). 
+ */ +static uint16_t +type1_rx_callback(uint16_t port, + uint16_t queue, + struct rte_mbuf *pkt[], + uint16_t nb_pkts, + uint16_t max_pkts, + void *user_param) +{ + uint32_t j, tp, l2_len, l3_len; + struct glue_ctx *ctx; + uint16_t nb_zero = 0; + const struct ether_hdr *eth; + const struct ipv4_hdr *ip4; + const struct ipv6_hdr *ip6; + uint16_t etp; + + RTE_SET_USED(port); + RTE_SET_USED(queue); + RTE_SET_USED(max_pkts); + + ctx = user_param; + + for (j = 0; j != nb_pkts; j++) { + tp = pkt[j]->packet_type & (RTE_PTYPE_L4_MASK | + RTE_PTYPE_L3_MASK | RTE_PTYPE_L2_MASK); + + switch (tp) { + case RTE_PTYPE_L2_ETHER: + eth = rte_pktmbuf_mtod(pkt[j], const struct ether_hdr *); + etp = eth->ether_type; + if (etp == rte_be_to_cpu_16(ETHER_TYPE_ARP)) + arp_recv(ctx, pkt[j], sizeof(*eth)); + pkt[j] = NULL; + nb_zero++; + break; + case (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L2_ETHER): + ip4 = rte_pktmbuf_mtod_offset(pkt[j], + const struct ipv4_hdr *, + sizeof(*eth)); + if (ip4->next_proto_id == IPPROTO_ICMP) { + l2_len = sizeof(struct ether_hdr); + l3_len = get_ipv4_hdr_len(pkt[j], l2_len, IPPROTO_ICMP, 0); + icmp_recv(ctx, pkt[j], l2_len, l3_len); + } else { + rte_pktmbuf_free(pkt[j]); + } + pkt[j] = NULL; + nb_zero++; + break; + case (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L2_ETHER): + case (RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L2_ETHER): + ip6 = rte_pktmbuf_mtod_offset(pkt[j], + const struct ipv6_hdr *, + sizeof(*eth)); + if (ip6->proto == IPPROTO_ICMPV6) { + l2_len = sizeof(struct ether_hdr); + l3_len = get_ipv6_hdr_len(pkt[j], l2_len, IPPROTO_ICMPV6); + icmp6_recv(ctx, pkt[j], l2_len, l3_len); + } else { + rte_pktmbuf_free(pkt[j]); + } + pkt[j] = NULL; + nb_zero++; + break; + default: + if (common_fill_hdr_len(pkt[j], tp, ctx) == NULL) { + pkt[j] = NULL; + nb_zero++; + } + break; + } + } + + if (nb_zero == 0) + return nb_pkts; + + return compress_pkt_list(pkt, nb_pkts, nb_zero); +} + +/* + * generic, assumes HW doesn't recognize any packet type. 
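The type{0,1,n} callbacks here all hang off the standard DPDK rx-callback
hook, exactly as setup_rx_cb() does further below. For orientation, a
pass-through registration with the same shape (queue 0 assumed, minimal
error handling):

    #include <rte_ethdev.h>
    #include <rte_errno.h>

    /* Sketch: a no-op rx callback wired up the way setup_rx_cb()
     * wires the real ones. */
    static uint16_t
    demo_rx_cb(uint16_t port, uint16_t queue, struct rte_mbuf *pkt[],
               uint16_t nb_pkts, uint16_t max_pkts, void *user_param)
    {
            RTE_SET_USED(port);
            RTE_SET_USED(queue);
            RTE_SET_USED(max_pkts);
            RTE_SET_USED(user_param);
            RTE_SET_USED(pkt);
            return nb_pkts; /* keep every packet */
    }

    static int
    demo_setup(uint16_t port_id)
    {
            if (rte_eth_add_rx_callback(port_id, 0, demo_rx_cb, NULL) == NULL)
                    return -rte_errno;
            return 0;
    }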
+ */ +uint16_t +typen_rx_callback(uint16_t port, + uint16_t queue, + struct rte_mbuf *pkt[], + uint16_t nb_pkts, + uint16_t max_pkts, + void *user_param) +{ + uint32_t j; + uint16_t nb_zero; + struct glue_ctx *ctx; + + RTE_SET_USED(port); + RTE_SET_USED(queue); + RTE_SET_USED(max_pkts); + + ctx = user_param; + + nb_zero = 0; + for (j = 0; j != nb_pkts; j++) { + /* fix me: now we avoid checking ip checksum */ + pkt[j]->ol_flags &= (~PKT_RX_IP_CKSUM_BAD); + pkt[j]->packet_type = 0; + pkt[j] = fill_ptypes_and_hdr_len(ctx, pkt[j]); + nb_zero += (pkt[j] == NULL); + } + + if (nb_zero == 0) + return nb_pkts; + + return compress_pkt_list(pkt, nb_pkts, nb_zero); +} + +static uint32_t +get_ptypes(uint16_t port_id) +{ + uint32_t smask; + int32_t i, rc; + const uint32_t pmask = + RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_MASK; + + smask = 0; + rc = rte_eth_dev_get_supported_ptypes(port_id, pmask, NULL, 0); + if (rc < 0) { + RTE_LOG(ERR, USER1, + "%s(port=%u) failed to get supported ptypes;\n", + __func__, port_id); + return smask; + } + + uint32_t ptype[rc]; + rc = rte_eth_dev_get_supported_ptypes(port_id, pmask, ptype, rc); + + for (i = 0; i != rc; i++) { + switch (ptype[i]) { + case RTE_PTYPE_L2_ETHER_ARP: + smask |= ETHER_ARP_PTYPE; + break; + case RTE_PTYPE_L3_IPV4: + case RTE_PTYPE_L3_IPV4_EXT_UNKNOWN: + smask |= IPV4_PTYPE; + break; + case RTE_PTYPE_L3_IPV4_EXT: + smask |= IPV4_EXT_PTYPE; + break; + case RTE_PTYPE_L3_IPV6: + case RTE_PTYPE_L3_IPV6_EXT_UNKNOWN: + smask |= IPV6_PTYPE; + break; + case RTE_PTYPE_L3_IPV6_EXT: + smask |= IPV6_EXT_PTYPE; + break; + case RTE_PTYPE_L4_TCP: + smask |= TCP_PTYPE; + break; + case RTE_PTYPE_L4_UDP: + smask |= UDP_PTYPE; + break; + case RTE_PTYPE_L4_ICMP: + smask |= ICMP_PTYPE; + break; + } + } + + return smask; +} + +/* In rx callbacks, we need to check and make sure below things are done, + * either by hw or by sw: + * 1. filter out arp packets, and handle arp packets properly + * - for arp request packet, reply arp if it's requesting myself. + * 2. fill l2, l3, l4 header length + * + * 3. GSO/GRO setup (TODO) + * + */ +int +setup_rx_cb(uint16_t port_id, uint16_t qid) +{ + int32_t rc; + uint32_t i, n, smask; + const void *cb; + struct glue_ctx *ctx; + const struct ptype2cb *ptype2cb; + + static const struct ptype2cb tcp_arp_ptype2cb[] = { + { /* i40e */ + .mask = ETHER_ARP_PTYPE | + ICMP_PTYPE | + IPV4_PTYPE | IPV4_EXT_PTYPE | + IPV6_PTYPE | IPV6_EXT_PTYPE | + TCP_PTYPE | UDP_PTYPE, + .name = "HW l2-arp/l3x/l4-tcp ptype", + .fn = type0_rx_callback, + }, + { /* ixgbe does not support ARP ptype */ + .mask = IPV4_PTYPE | IPV4_EXT_PTYPE | + IPV6_PTYPE | IPV6_EXT_PTYPE | + TCP_PTYPE | UDP_PTYPE, + .name = "HW l3x/l4-tcp ptype", + .fn = type1_rx_callback, + }, + { /* virtio */ + .mask = 0, + .name = "HW does not support any ptype", + .fn = typen_rx_callback, + }, + }; + + ctx = glue_ctx_lookup(port_id, qid); + if (ctx == NULL) { + GLUE_LOG(ERR, "no ctx fount by port(%d) and queue (%d)", + port_id, qid); + return -EINVAL; + } + + smask = get_ptypes(port_id); + + ptype2cb = tcp_arp_ptype2cb; + n = RTE_DIM(tcp_arp_ptype2cb); + + for (i = 0; i != n; i++) { + if ((smask & ptype2cb[i].mask) == ptype2cb[i].mask) { + cb = rte_eth_add_rx_callback(port_id, qid, + ptype2cb[i].fn, ctx); + rc = -rte_errno; + GLUE_LOG(ERR, "%s(port=%u), setup RX callback \"%s\";", + __func__, port_id, ptype2cb[i].name); + return ((cb == NULL) ? 
rc : 0); + } + } + + GLUE_LOG(ERR, "%s(port=%u) failed to find an appropriate callback", + __func__, port_id); + return -ENOENT; +} diff --git a/lib/libtle_glue/rxtx.c b/lib/libtle_glue/rxtx.c new file mode 100644 index 0000000..4b6c391 --- /dev/null +++ b/lib/libtle_glue/rxtx.c @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sym.h" + +#include +#include +#include +#include + +#include + +#include +#include + +#include "tle_glue.h" +#include "fd.h" +#include "util.h" +#include "internal.h" + +#define MAX_UDP_PKT_LEN ((2 << 16) - 1 - sizeof(struct ipv4_hdr) - sizeof(struct udp_hdr)) + +static inline struct rte_mbuf * +from_mbuf_to_buf(struct rte_mbuf *m, char *buf, size_t len, int ispeek, int needcpy) +{ + void *src; + uint32_t done = 0; + uint32_t left = len, orig_pkt_len; + uint16_t copy_len, seg_len, segs; + struct rte_mbuf *m_next, *orig_pkt; + + if (len == 0) + return m; + + orig_pkt = m; + orig_pkt_len = m->pkt_len; + segs = m->nb_segs; + + do { + seg_len = rte_pktmbuf_data_len(m); + copy_len = RTE_MIN(seg_len, left); + src = rte_pktmbuf_mtod(m, void *); + if (needcpy) + rte_memcpy(buf + done, src, copy_len); + done += copy_len; + left -= copy_len; + if (copy_len < seg_len) { + if (!ispeek) { + rte_pktmbuf_adj(m, copy_len); + } + break; + } + m_next = m->next; + if (!ispeek) { + rte_pktmbuf_free_seg(m); + segs--; + } + m = m_next; + } while (left && m); + + if (m && !ispeek) { + m->nb_segs = segs; + m->pkt_len = orig_pkt_len - done; + } + + if(ispeek) + return orig_pkt; + else + return m; +} + +static inline bool +is_peer_closed(struct sock *so) +{ + if (errno == EAGAIN && so->erev && + tle_event_state(so->erev) == TLE_SEV_UP) + return true; + + return false; +} + +static ssize_t +_recv(int sockfd, void *buf, size_t len, struct sockaddr *src_addr, int flags) +{ + int rx; + ssize_t rc; + ssize_t recvlen; + size_t tmplen; + struct sock *so; + struct rte_mbuf *m; + struct epoll_event event; + int needcpy; + + so = fd2sock(sockfd); + + if (so->s == NULL || so->txev == NULL || + tle_event_state(so->txev) == TLE_SEV_IDLE) { + if (IS_UDP(so) && is_nonblock(so, flags)) + errno = EAGAIN; + else + errno = ENOTCONN; + return -1; + } + + if (so->rx_left) { + m = so->rx_left; + so->rx_left = NULL; + if (src_addr) { + OPS(so)->getname(so, src_addr, 1); + /* fixme: cannot get addr for UDP in this way */ + } + } else { + rc = OPS(so)->recv(so->s, &m, 1, src_addr); + if (rc == 0) { + if (is_nonblock(so, flags)) { + /* socket closed, return 0 */ + if (is_peer_closed(so)) { + GLUE_DEBUG("peer closed: %d", sockfd); + return 0; + } +#ifndef LOOK_ASIDE_BACKEND + /* Receive from shutdown socket will generate + * EAGAIN. Special case, let upper application + * handle the error code. 
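from_mbuf_to_buf() above linearizes an mbuf chain into the caller's buffer
and, unless peeking, consumes what it copied. Stripped of the consume/peek
bookkeeping, the copy loop is the classic gather below:

    #include <string.h>
    #include <rte_mbuf.h>

    /* Sketch: copy up to len bytes out of a chain without freeing or
     * adjusting it -- i.e. the MSG_PEEK flavor. Returns bytes copied. */
    static size_t
    gather_peek(const struct rte_mbuf *m, char *buf, size_t len)
    {
            size_t done = 0, n;

            for (; m != NULL && done < len; m = m->next) {
                    n = RTE_MIN((size_t)rte_pktmbuf_data_len(m), len - done);
                    memcpy(buf + done, rte_pktmbuf_mtod(m, const char *), n);
                    done += n;
            }
            return done;
    }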
+ */ + if (errno == ESHUTDOWN) + errno = EAGAIN; +#endif + return -1; + } else { + do { + /* in blocking mode, recv from shutdown socket + * return 0 immediately */ + if (errno == ESHUTDOWN) + return 0; + + /* some error occured, return -1 */ + if (errno != EAGAIN) + return -1; + + /* socket closed, return 0 */ + if (is_peer_closed(so)) { + GLUE_DEBUG("peer closed: %d", sockfd); + return 0; + } + + epoll_kernel_wait(CTX(so), -1, &event, 1, 1, &rx); + + be_process(CTX(so)); + } while((rc = OPS(so)->recv(so->s, &m, 1, src_addr)) == 0); + } + } + } + + /* get one pkt */ + if (!timestamp_needed) + so->s->timestamp = m->timestamp; + + needcpy = 1; + recvlen = RTE_MIN(m->pkt_len, len); + if (flags & MSG_TRUNC) { + if (IS_UDP(so)) + recvlen = m->pkt_len; + else + /* According to linux manual, data will be discarded + * if recv TCP stream with MSG_TRUNC flag */ + needcpy = 0; + } + + so->rx_left = from_mbuf_to_buf(m, buf, len, flags & MSG_PEEK, needcpy); + + if (((flags & MSG_PEEK) == 0) && IS_UDP(so) && so->rx_left) { + rte_pktmbuf_free(so->rx_left); + so->rx_left = NULL; + } + + /* UDP socket only receive one pkt at one time */ + if (IS_UDP(so) || (flags & MSG_PEEK)) { + return recvlen; + } + /* TCP socket: try best to fill buf */ + len -= recvlen; + buf = (char*)buf + recvlen; + while (len) { + if (OPS(so)->recv(so->s, &m, 1, src_addr) == 0) + break; + + tmplen = (m->pkt_len < len) ? m->pkt_len : len; + so->rx_left = from_mbuf_to_buf(m, buf, tmplen, 0, needcpy); + len -= tmplen; + recvlen += tmplen; + buf = (char*)buf + tmplen; + } + + if (so->rx_left) + tle_event_raise(so->rxev); + + /* may send window increase ACK after receive*/ + if (recvlen > 0) + be_tx_with_lock(CTX(so)); + + return recvlen; +} + +ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags) +{ + if (is_kernel_fd(sockfd)) + return k_read(sockfd, buf, len); + + return _recv(sockfd, buf, len, NULL, flags); +} + +ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + ssize_t rc; + if (is_kernel_fd(sockfd)) + return k_recv(sockfd, buf, len, flags); + + if (src_addr && !addrlen) { + errno = EINVAL; + return -1; + } + rc = _recv(sockfd, buf, len, src_addr, flags); + if (rc >= 0 && src_addr) { + if (src_addr->sa_family == AF_INET) { + *addrlen = sizeof(struct sockaddr_in); + } else { + *addrlen = sizeof(struct sockaddr_in6); + } + } + return rc; +} + +#define RECV_CONTINUE (-2) +static inline ssize_t +try_readv(struct sock *so, int percall_flags, const struct iovec *iov, + int iovcnt, struct msghdr *msg) +{ + ssize_t sz; + + if (so->s == NULL) { + if (IS_UDP(so) && is_nonblock(so, percall_flags)) + errno = EAGAIN; + else + errno = ENOTCONN; + return -1; + } + + sz = OPS(so)->readv(so->s, iov, iovcnt, msg); + if (sz >= 0) { /* get data */ + /* may send window increase ACK after receive*/ + if (sz > 0) + be_tx_with_lock(CTX(so)); + return sz; + } + else if (errno != EAGAIN) /* error occurred */ + return -1; + else if (is_peer_closed(so)) { + GLUE_DEBUG("peer closed: %d", sockfd); + return 0; + } else if (is_nonblock(so, percall_flags)) + return -1; + + return RECV_CONTINUE; +} + +ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags) +{ + ssize_t sz; + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_recvmsg(sockfd, msg, flags); + + so = fd2sock(sockfd); + + if (so->rx_left == NULL && OPS(so)->readv) { + sz = try_readv(so, flags, msg->msg_iov, msg->msg_iovlen, msg); + if (sz != RECV_CONTINUE) + return sz; + } + + /* 1. rx_left != NULL; 2. 
get no data, fall back to blocking read */ + + if (so->rx_left != NULL && msg != NULL && msg->msg_control != NULL) { + if (timestamp_needed) + set_msg_timestamp(msg, so->rx_left); + else + msg->msg_controllen = 0; + } + + sz = PRE(recvfrom)(sockfd, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len, + flags, (struct sockaddr *)msg->msg_name, &msg->msg_namelen); + + return sz; +} + +ssize_t PRE(read)(int fd, void *buf, size_t count) +{ + if (is_kernel_fd(fd)) + return k_read(fd, buf, count); + + return _recv(fd, buf, count, NULL, 0); +} + +ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t sz; + struct sock *so; + + if (is_kernel_fd(fd)) + return k_readv(fd, iov, iovcnt); + + so = fd2sock(fd); + + if (so->rx_left == NULL && OPS(so)->readv) { + sz = try_readv(so, 0, iov, iovcnt, NULL); + if (sz != RECV_CONTINUE) + return sz; + } + + /* 1. rx_left != NULL; 2. get no data, fall back to blocking read */ + + /* fixme: when so->rx_left != NULL, also needs readv. + * maybe need to modify readv interface args of ops */ + return _recv(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0); +} + +static ssize_t +_send(int sockfd, const void *buf, size_t len, + const struct sockaddr *peer, int flags) +{ + struct sock *so = fd2sock(sockfd); + struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */ + uint16_t nb_mbufs = (len + RTE_MBUF_DEFAULT_DATAROOM - 1) / + RTE_MBUF_DEFAULT_DATAROOM; + uint16_t i, cnt, copy_len; + int rc; + struct rte_mbuf *mbufs[nb_mbufs + 1]; + size_t done = 0; + uint32_t left = 0; + char *dst; + int blocking = !is_nonblock(so, flags); + + if (!blocking && len > def_sndbuf && so->proto == PROTO_TCP) { + len = def_sndbuf; + nb_mbufs = (len + RTE_MBUF_DEFAULT_DATAROOM - 1) / + RTE_MBUF_DEFAULT_DATAROOM; + } + + if (unlikely(len == 0)) { + if (so->proto == PROTO_TCP) + return 0; + else + nb_mbufs += 1; + } + + if (unlikely(len > MAX_UDP_PKT_LEN && IS_UDP(so))) { + errno = EMSGSIZE; + return -1; + } + + if (blocking) + be_process(get_ctx()); + + if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) < 0)) { + errno = ENOMEM; + return -1; + } + + for (i = 0; i < nb_mbufs; ++i) { + copy_len = RTE_MIN((size_t)RTE_MBUF_DEFAULT_DATAROOM, + len - done); + dst = rte_pktmbuf_mtod(mbufs[i], char *); + rte_memcpy(dst, (const char *)buf + done, copy_len); + done += copy_len; + mbufs[i]->data_len = copy_len; + mbufs[i]->pkt_len = copy_len; + } + + cnt = 0; +do_send: + rc = OPS(so)->send(so, mbufs + cnt, nb_mbufs - cnt, peer); + + cnt += rc; + + if (cnt > 0) + be_tx_with_lock(CTX(so)); + + if (cnt > 0 && blocking) + be_process(get_ctx()); + + if ((rc > 0 || errno == EAGAIN) && + cnt < nb_mbufs && blocking && + tle_event_state(so->erev) != TLE_SEV_UP) { + be_process(get_ctx()); + goto do_send; + } + + for (i = cnt; i < nb_mbufs; ++i) { + left += mbufs[i]->pkt_len; + rte_pktmbuf_free_seg(mbufs[i]); + } + + if (cnt == 0) + return -1; + else + return len - left; +} + +ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags) +{ + if (is_kernel_fd(sockfd)) + return k_write(sockfd, buf, len); + + /* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */ + flags &= ~MSG_NOSIGNAL; + + return _send(sockfd, buf, len, NULL, flags); +} + +ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + if (is_kernel_fd(sockfd)) + return k_sendto(sockfd, buf, len, flags, dest_addr, addrlen); + + /* MSG_NOSIGNAL means "Do not generate SIGPIPE". 
Ignore this flag */ + flags &= ~MSG_NOSIGNAL; + + return _send(sockfd, buf, len, dest_addr, flags); +} + +ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags) +{ + ssize_t ret; + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_sendmsg(sockfd, msg, flags); + + /* MSG_NOSIGNAL means "Do not generate SIGPIPE". Ignore this flag */ + flags &= ~MSG_NOSIGNAL; + + so = fd2sock(sockfd); + if (OPS(so)->writev) { + ret = OPS(so)->writev(so, msg->msg_iov, msg->msg_iovlen, + msg->msg_name); + if (ret < 0) { + if (errno != EAGAIN || is_nonblock(so, flags)) + return -1; + } else { + /* TODO: blocking && ret < total length */ + be_tx_with_lock(CTX(so)); + return ret; + } + + /* fall through to blocking send */ + } + + return _send(sockfd, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len, + (struct sockaddr *)msg->msg_name, flags); +} + +ssize_t PRE(write)(int fd, const void *buf, size_t count) +{ + if (is_kernel_fd(fd)) + return k_write(fd, buf, count); + + return _send(fd, buf, count, NULL, 0); +} + +ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t ret; + struct sock *so; + + if (is_kernel_fd(fd)) + return k_writev(fd, iov, iovcnt); + + so = fd2sock(fd); + if (OPS(so)->writev) { + ret = OPS(so)->writev(so, iov, iovcnt, NULL); + if (ret < 0) { + if (errno != EAGAIN || is_nonblock(so, 0)) + return -1; + } else { + /* TODO: blocking && ret < total length */ + be_tx_with_lock(CTX(so)); + return ret; + } + + /* fall through to blocking send */ + } + + return _send(fd, iov[0].iov_base, iov[0].iov_len, NULL, 0); +} + +/* advanced functions */ +ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags) +{ + if (is_kernel_fd(fd_in) && is_kernel_fd(fd_out)) + return k_splice(fd_in, off_in, fd_out, off_out, len, flags); + + rte_panic("splice is not supported yet"); + errno = EOPNOTSUPP; + return -1; +} + +ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count) +{ + if (is_kernel_fd(out_fd) && is_kernel_fd(in_fd)) + return k_sendfile(out_fd, in_fd, offset, count); + + rte_panic("sendfile is not supported yet"); + errno = EOPNOTSUPP; + return -1; +} diff --git a/lib/libtle_glue/select.c b/lib/libtle_glue/select.c new file mode 100644 index 0000000..3b4fc00 --- /dev/null +++ b/lib/libtle_glue/select.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
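_send() above splits the user buffer into default-dataroom mbufs before
handing them to the stream; the mbuf count is a plain ceiling division, with
one extra mbuf reserved for a zero-length UDP datagram. The sizing rule in
isolation:

    #include <rte_mbuf.h>

    /* Sketch: mbufs needed to carry len payload bytes when each mbuf
     * holds RTE_MBUF_DEFAULT_DATAROOM bytes (ceiling division). */
    static unsigned int
    mbufs_for(size_t len)
    {
            return (len + RTE_MBUF_DEFAULT_DATAROOM - 1) /
                   RTE_MBUF_DEFAULT_DATAROOM;
    }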
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include + +#include "fd.h" +#include "ctx.h" +#include "sym.h" +#include "log.h" +#include "util.h" +#include "internal.h" +#include "tle_glue.h" + +#define FD_ZERO_N(s, n) do { memset((s)->fds_bits, 0, n/sizeof(long)); } while(0) + +static int +fdset_to_events_user(int nfds, fd_set *fdset, int *total, int event) +{ + int i, num = 0; + struct sock *so; + const struct tle_event *ev; + + for (i = fd_table.fd_base; i < nfds; ++i) { + if (!FD_ISSET(i, fdset)) + continue; + + so = fd2sock(i); /* fix me: check if fd is opened */ + + switch (event) { + case EPOLLIN: + ev = so->rxev; + break; + case EPOLLOUT: + ev = so->txev; + break; + case EPOLLERR: + ev = so->erev; + break; + default: + rte_panic("non-sense value\n"); + } + /* Check event is ready */ + if (TLE_SEV_UP == tle_event_state(ev)) { + *total = *total + 1; + } else { + FD_CLR(i, fdset); + num++; + } + + /* We fill sock->event here as we need this when + * we filter events in poll_common(). But it was + * originally set by epoll_ctl(). Now we have to + * assume that there are no application which + * uses epoll/poll/select at the same time. + */ + so->event.events |= event; + so->event.data.u32 = i; + } + + return num; +} + +static int +fdset_to_events_kernel(int nfds, fd_set *fdset, int efd, int event) +{ + int i, num = 0; + struct epoll_event k_ev; + + for (i = 0; i < nfds; ++i) { + if (!FD_ISSET(i, fdset)) + continue; + + k_ev.events = event; + k_ev.data.u32 = i; + k_epoll_ctl(efd, EPOLL_CTL_ADD, i, &k_ev); + num++; + } + + return num; +} + +int +PRE(select)(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) +{ + int to; + struct glue_ctx *ctx; + int j, efd, total = 0, max = 0; + + /* thread <> context binding happens here */ + if (RTE_PER_LCORE(glue_ctx) == NULL) { + ctx = &ctx_array[glue_ctx_alloc()]; + RTE_PER_LCORE(glue_ctx) = ctx; + } else + ctx = RTE_PER_LCORE(glue_ctx); + + /* step 0, process some packets */ + be_process(ctx); + + /* step 1, check if any userspace events are ready */ + + if (readfds) + max += fdset_to_events_user(nfds, readfds, + &total, EPOLLIN); + if (writefds) + max += fdset_to_events_user(nfds, writefds, + &total, EPOLLOUT); + if (exceptfds) + max += fdset_to_events_user(nfds, writefds, + &total, EPOLLERR); + if (total > 0) { + /* userspace events go firstly */ + if (readfds) + FD_ZERO_N(readfds, fd_table.fd_base); + if (writefds) + FD_ZERO_N(writefds, fd_table.fd_base); + if (exceptfds) + FD_ZERO_N(exceptfds, fd_table.fd_base); + + return total; + } + + /* step 2, only wait for kernel events? 
*/ + if (max == 0) + return k_select(nfds, readfds, writefds, exceptfds, timeout); + + /* step 3, slow path: wait for I/O and kernel events */ + efd = k_epoll_create(1); + if (efd < 0) + rte_panic("k_epoll_create failed %d", errno); + + nfds = RTE_MIN(nfds, fd_table.fd_base); + if (readfds) + max += fdset_to_events_kernel(nfds, readfds, + efd, EPOLLIN); + if (writefds) + max += fdset_to_events_kernel(nfds, writefds, + efd, EPOLLOUT); + if (exceptfds) + max += fdset_to_events_kernel(nfds, exceptfds, + efd, EPOLLERR); + + struct epoll_event events[max]; + + if (timeout) + to = timeout->tv_sec * 1000 + timeout->tv_usec / 1000; + else + to = -1; + total = poll_common(ctx, events, max, to, efd); + + k_close(efd); + for (j = 0; j < total; ++j) { + if (events[j].events & EPOLLIN) + FD_SET(events[j].data.fd, readfds); + + if (events[j].events & EPOLLOUT) + FD_SET(events[j].data.fd, writefds); + + if ((events[j].events & (EPOLLHUP | EPOLLERR)) && exceptfds) + FD_SET(events[j].data.fd, exceptfds); + } + return total; +} + +int +PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, + const struct timespec *timeout, const sigset_t *sigmask) +{ + struct timeval tv, *tv_to; + + if (sigmask != NULL) + rte_panic("pselect with signal is not supported"); + + if (timeout) { + tv.tv_usec = timeout->tv_nsec / 1000; + tv.tv_sec = timeout->tv_sec; + tv_to = &tv; + } else + tv_to = NULL; + + return select(nfds, readfds, writefds, exceptfds, tv_to); +} diff --git a/lib/libtle_glue/sock.h b/lib/libtle_glue/sock.h new file mode 100644 index 0000000..d2ec73d --- /dev/null +++ b/lib/libtle_glue/sock.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
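Like PRE(ppoll), PRE(pselect) above falls back to the non-signal variant,
converting the timespec into a timeval on the way. The conversion alone:

    #include <stddef.h>
    #include <sys/time.h>
    #include <time.h>

    /* Sketch: timespec -> timeval as done by the pselect wrapper;
     * NULL still means block forever. */
    static struct timeval *
    ts_to_tv(const struct timespec *ts, struct timeval *tv)
    {
            if (ts == NULL)
                    return NULL;
            tv->tv_sec = ts->tv_sec;
            tv->tv_usec = ts->tv_nsec / 1000;
            return tv;
    }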
+ */ +#ifndef _SOCK_H_ +#define _SOCK_H_ + +#include +#include +#include +#include + +#include +#include +#include "../libtle_l4p/stream.h" +#include "../libtle_l4p/net_misc.h" + +#include "ctx.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern unsigned int def_sndbuf; +extern unsigned int def_rcvbuf; + +#ifndef TCP_FASTOPEN +#define TCP_FASTOPEN 23 +#endif + +#ifndef TCP_USER_TIMEOUT +#define TCP_USER_TIMEOUT 18 +#endif + +#ifndef TCP_FASTOPEN_CONNECT +#define TCP_FASTOPEN_CONNECT 30 +#endif + +struct sock; + +struct proto { + int (*setsockopt)(struct sock *sk, int optname, const void *optval, + socklen_t optlen); + int (*getsockopt)(struct sock *sk, int optname, void *optval, + socklen_t *option); + int (*getname)(struct sock *sk, struct sockaddr *addr, int peer); + + int (*bind)(struct sock *sk, const struct sockaddr *addr); + int (*listen)(struct sock *sk, int backlog); + int (*connect)(struct sock *sk, const struct sockaddr *addr); + int (*accept)(struct sock *sk, struct sockaddr *addr, + socklen_t *addrlen, int flags); + + ssize_t (*recv)(struct tle_stream *s, struct rte_mbuf *pkt[], + uint16_t num, struct sockaddr *addr); + ssize_t (*send)(struct sock *sk, struct rte_mbuf *pkt[], + uint16_t num, const struct sockaddr *dst_addr); + + ssize_t (*readv)(struct tle_stream *s, const struct iovec *iov, + int iovcnt, struct msghdr *msg); + ssize_t (*writev)(struct sock *sk, const struct iovec *iov, + int iovcnt, const struct sockaddr *dst_addr); + + int (*shutdown)(struct sock *sk, int how); + int (*close)(struct tle_stream *s); + + void (*update_cfg)(struct sock *sk); + + char name[32]; +}; + +enum { + PROTO_TCP, + PROTO_UDP +}; + +extern struct proto udp_prot; +extern struct proto tcp_prot; +extern struct proto *supported_proto_ops[]; + +struct sock { + int fd; + uint32_t cid:8, /* ctx id for indexing ctx_array */ + type:8, /* for TLE_V4, TLE_V6 */ + proto:8, /* PROTO_TCP, PROTO_UDP */ + valid:1, + epoll:1, + ubind:1, + nonblock:1; + struct tle_stream *s; + struct tle_event *erev; + struct tle_event *rxev; + struct tle_event *txev; + struct rte_mbuf *rx_left; + + union stream_option option; + + union { + struct epoll_event event; + int shadow_efd; + }; +} __rte_cache_aligned; + +#define CTX(so) (&ctx_array[so->cid]) +#define OPS(so) (supported_proto_ops[so->proto]) +#define IS_TCP(so) (so->proto == PROTO_TCP) +#define IS_UDP(so) (so->proto == PROTO_UDP) + +static inline int +is_nonblock(struct sock *so, int flags) +{ + return (flags & MSG_DONTWAIT) || so->nonblock; +} + +static inline struct tle_ctx * +get_sock_ctx(struct sock *so) +{ + if (IS_TCP(so)) + return CTX(so)->tcp_ctx; + else + return CTX(so)->udp_ctx; +} + +static inline size_t +get_sockaddr_len(struct sock *so) +{ + if (so->type == TLE_V4) + return sizeof(struct sockaddr_in); + else + return sizeof(struct sockaddr_in6); +} + +static inline size_t +get_sockaddr_len_family(sa_family_t family) +{ + if (family == AF_INET) + return sizeof(struct sockaddr_in); + else + return sizeof(struct sockaddr_in6); +} + +#ifdef __cplusplus +} +#endif + +#endif /*_SOCK_H_ */ diff --git a/lib/libtle_glue/socket.c b/lib/libtle_glue/socket.c new file mode 100644 index 0000000..bb94d47 --- /dev/null +++ b/lib/libtle_glue/socket.c @@ -0,0 +1,671 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
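struct proto above is the per-protocol vtable that tcp_prot and udp_prot fill
in elsewhere in this patch; the wrappers reach it through OPS(so) and return
EOPNOTSUPP when an op is NULL. A hypothetical minimal entry (the demo_* names
are invented for illustration):

    /* Sketch only: what a third protocol would plug into
     * supported_proto_ops[]; demo_bind/demo_listen are stubs. */
    static int
    demo_bind(struct sock *sk, const struct sockaddr *addr)
    {
            (void)sk; (void)addr;
            return 0;
    }

    static int
    demo_listen(struct sock *sk, int backlog)
    {
            (void)sk; (void)backlog;
            return 0;
    }

    static struct proto demo_prot = {
            .bind   = demo_bind,
            .listen = demo_listen,
            /* ops left NULL make callers fail with EOPNOTSUPP */
            .name   = "DEMO",
    };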
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sym.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tle_glue.h" +#include "fd.h" +#include "log.h" +#include "util.h" +#include "internal.h" +#include "sock.h" + +uint32_t timestamp_needed = 0; + +struct proto *supported_proto_ops[] = { + [PROTO_TCP] = &tcp_prot, + [PROTO_UDP] = &udp_prot, +}; + +/* for setup, settings, and destroy */ +int PRE(socket)(int domain, int type, int protocol) +{ + int fd; + struct sock *so; + + if ((domain != AF_INET && domain != AF_INET6) || + (type != SOCK_STREAM && type != SOCK_DGRAM)) + return k_socket(domain, type, protocol); + + fd = get_unused_fd(); + so = fd2sock(fd); + so->cid = get_cid(); + if (type == SOCK_STREAM) + so->proto = PROTO_TCP; + else /* type == SOCK_DGRAM */ + so->proto = PROTO_UDP; + + so->type = domain == AF_INET ? TLE_V4 : TLE_V6; + + so->option.raw = 0; + so->option.mulloop = 1; + so->option.multtl = 1; + if (type == SOCK_STREAM) { + so->option.tcpquickack = 1; + so->option.keepidle = 2 * 60 * 60; /*linux default value: 2hours*/ + so->option.keepintvl = 75; /*linux default value: 75seconds*/ + } + + GLUE_DEBUG("socket fd = %d", fd); + return fd; +} + +int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen) +{ + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_bind(sockfd, addr, addrlen); + + so = fd2sock(sockfd); + if (so->s) { + /* The socket is already bound to an address */ + errno = EINVAL; + return -1; + } + /* fixme: It is not conform to linux standard, fix it later. 
+ * should return truncated address */ + if (addrlen < get_sockaddr_len_family(addr->sa_family)) { + errno = EINVAL; + return -1; + } + + so->cid = get_cid(); /* allow ctx reset as stream is null */ + if (OPS(so)->bind) + return OPS(so)->bind(so, addr); + + errno = EOPNOTSUPP; + return -1; +} + +int PRE(listen)(int sockfd, int backlog) +{ + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_listen(sockfd, backlog); + + so = fd2sock(sockfd); + + if (OPS(so)->listen) + return OPS(so)->listen(so, backlog); + + errno = EOPNOTSUPP; + return -1; +} + +int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen) +{ + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_accept(sockfd, addr, addrlen); + + so = fd2sock(sockfd); + if (OPS(so)->accept) + return OPS(so)->accept(so, addr, addrlen, 0); + + errno = EOPNOTSUPP; + return -1; +} + +int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) +{ + int fd; + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_accept4(sockfd, addr, addrlen, flags); + + fd = PRE(accept)(sockfd, addr, addrlen); + + /* inherit NONBLOCK flag */ + if (fd >= 0 && (flags & SOCK_NONBLOCK)) { + so = fd2sock(fd); + so->nonblock = 1; + } + + return fd; +} + +int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen) +{ + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_connect(sockfd, addr, addrlen); + + so = fd2sock(sockfd); + so->cid = get_cid(); + + if (!(is_nonblock(so, 0))) + mac_check(CTX(so), addr); + + if (OPS(so)->connect) + return OPS(so)->connect(so, addr); + + errno = EOPNOTSUPP; + return -1; +} + +unsigned int def_sndbuf = 212992; +unsigned int def_rcvbuf = 212992; +static struct linger ling; + +int PRE(getsockopt)(int sockfd, int level, int optname, + void *optval, socklen_t *optlen) +{ + struct sock *so; + union { + int val; + uint64_t val64; + struct linger ling; + struct timeval tm; + } *p = optval; + + + if (is_kernel_fd(sockfd)) + return k_getsockopt(sockfd, level, optname, optval, optlen); + + if (!optval && !optlen) + return -1; + + so = fd2sock(sockfd); + + switch (level) { + case IPPROTO_IP: + switch (optname) { + case IP_OPTIONS: + *optlen = 0; + return 0; + case IP_MULTICAST_LOOP: + p->val = so->option.mulloop; + return 0; + case IP_MULTICAST_TTL: + p->val = so->option.multtl; + return 0; + } + break; + case IPPROTO_IPV6: + switch (optname) { + case IPV6_V6ONLY: + p->val = so->option.ipv6only; + return 0; + } + break; + case SOL_SOCKET: + /* man socket(7), see /usr/include/asm-generic/socket.h */ + switch (optname) { + case SO_ERROR: + if (TLE_SEV_DOWN == tle_event_state(so->erev)) + p->val = 0; + else { + p->val = ECONNREFUSED; + /* fixe me: ETIMEDOUT */ + } + return 0; + case SO_LINGER: + p->ling.l_onoff = 0; + return 0; + case SO_SNDBUF: + p->val = def_sndbuf; + return 0; + case SO_RCVBUF: + p->val = def_rcvbuf; + return 0; + case SO_ACCEPTCONN: + if (IS_TCP(so) && + TCP_STREAM(so->s)->tcb.state == TCP_ST_LISTEN) + p->val = 1; + else + p->val = 0; + return 0; + case SO_KEEPALIVE: + p->val = so->option.keepalive; + return 0; + case SO_TYPE: + if (IS_TCP(so)) + p->val = SOCK_STREAM; + else + p->val = SOCK_DGRAM; + return 0; + case SO_OOBINLINE: + p->val = so->option.oobinline; + return 0; + case SO_TIMESTAMP: + p->val = timestamp_needed; + return 0; + default: + break; + } + + break; + case SOL_TCP: + case SOL_UDP: + return OPS(so)->getsockopt(so, optname, optval, optlen); + } + + GLUE_LOG(WARNING, "getsockopt(%d) with level = %d, optname = %d", + sockfd, level, 
optname); + errno = EOPNOTSUPP; + return -1; +} + +int PRE(setsockopt)(int sockfd, int level, int optname, + const void *optval, socklen_t optlen) +{ + int val; + struct sock *so; + if (is_kernel_fd(sockfd)) + return k_setsockopt(sockfd, level, optname, optval, optlen); + if (!optval && !optlen) + return -1; + + val = 0; /* just to make compiler happy */ + switch (optlen) { + case sizeof(char): + val = *(const char *)optval; + break; + case sizeof(int): + val = *(const int *)optval; + break; + } + + so = fd2sock(sockfd); + + switch (level) { + case IPPROTO_IP: + switch (optname) { + case IP_RECVERR: + /* needed by netperf */ + return 0; + case IP_MULTICAST_LOOP: + if (val == 0) + so->option.mulloop = 0; + else + so->option.mulloop = 1; + if (so->s != NULL) + so->s->option.mulloop = so->option.mulloop; + return 0; + case IP_MULTICAST_TTL: + if (val > 255 || val < -1) { + errno = EINVAL; + return -1; + } + if(val == -1) { + val = 1; + } + so->option.multtl = val; + if (so->s != NULL) + so->s->option.multtl = so->option.multtl; + return 0; + case IP_ADD_MEMBERSHIP: + if (optlen < sizeof(struct ip_mreq)) { + errno = EINVAL; + return -1; + } + const struct ip_mreq* mreq = (const struct ip_mreq*)optval; + if (mreq->imr_multiaddr.s_addr == INADDR_ANY) { + errno = EINVAL; + return -1; + } + errno = EOPNOTSUPP; + return -1; + case IP_MTU_DISCOVER: + return 0; + case IP_TOS: + return 0; + case IP_RECVTOS: + return 0; + } + break; + case IPPROTO_IPV6: + switch (optname) { + case IPV6_V6ONLY: + if (val == 0) + so->option.ipv6only = 0; + else + so->option.ipv6only = 1; + if (so->s != NULL) + so->s->option.ipv6only = so->option.ipv6only; + return 0; + case IPV6_TCLASS: + return 0; + case IPV6_RECVTCLASS: + return 0; + } + break; + case SOL_SOCKET: + switch (optname) { + case SO_REUSEADDR: + if (val == 0) + so->option.reuseaddr = 0; + else + so->option.reuseaddr = 1; + if (so->s != NULL) + so->s->option.reuseaddr = so->option.reuseaddr; + return 0; + case SO_LINGER: + ling = *(const struct linger *)optval; + if (ling.l_onoff == 0) + return 0; + else { + GLUE_LOG(ERR, "app is enabling SO_LINGER" + " which is not really supported"); + return 0; + } + break; + case SO_KEEPALIVE: + if (val == 0) + so->option.keepalive = 0; + else + so->option.keepalive = 1; + if (so->s != NULL) + so->s->option.keepalive = so->option.keepalive; + return 0; + case SO_REUSEPORT: + if (val == 0) + so->option.reuseport = 0; + else + so->option.reuseport = 1; + if (so->s != NULL) + so->s->option.reuseport = so->option.reuseport; + return 0; + case SO_SNDBUF: + def_sndbuf = val; + return 0; + case SO_RCVBUF: + def_rcvbuf = val; + return 0; + case SO_DONTROUTE: + /* needed by netperf */ + return 0; + case SO_BROADCAST: + /* needed by nc */ + /* todo: only supported for DGRAM */ + return 0; + case SO_TIMESTAMP: + timestamp_needed++; + return 0; + case SO_OOBINLINE: + if (val == 0) + so->option.oobinline = 0; + else + so->option.oobinline = 1; + if (so->s != NULL) + so->s->option.oobinline = so->option.oobinline; + return 0; + default: + break; + } + break; + case IPPROTO_TCP: + case IPPROTO_UDP: + return OPS(so)->setsockopt(so, optname, optval, optlen); + } + + GLUE_LOG(WARNING, "setsockopt(%d) with level = %d, optname = %d\n", + sockfd, level, optname); + errno = EOPNOTSUPP; + return -1; +} + +/* + * Refer to glibc/sysdeps/unix/sysv/linux/fcntl.c + */ +int PRE(fcntl)(int fd, int cmd, ...) 
+{ + int rc; + void *arg; + va_list ap; + struct sock *so; + + va_start(ap, cmd); + arg = va_arg(ap, void *); + va_end(ap); + + if (is_kernel_fd(fd)) + return k_fcntl(fd, cmd, arg); + + so = fd2sock(fd); + switch (cmd) { + case F_SETFL: + if ((unsigned long)arg & O_NONBLOCK) + so->nonblock = 1; + else + so->nonblock = 0; + rc = 0; + break; + case F_GETFL: + if (so->nonblock) + rc = O_NONBLOCK | O_RDWR; + else + rc = O_RDWR; + break; + case F_SETFD: + rc = 0; + break; + default: + rc = -1; + errno = EOPNOTSUPP; + rte_panic("fcntl(%d) with cmd = %d", fd, cmd); + } + + return rc; +} + +/* + * Refer to musl/src/misc/ioctl.c + */ +int PRE(ioctl)(int fd, unsigned long int request, ...) +{ + int rc; + void *arg; + va_list ap; + uint16_t left; + struct sock *so; + struct rte_mbuf *m; + + va_start(ap, request); + arg = va_arg(ap, void *); + va_end(ap); + + if (is_kernel_fd(fd)) + return k_ioctl(fd, request, arg); + + so = fd2sock(fd); + + switch (request) { + case FIONREAD: /* SIOCINQ */ + if (so->s == NULL) { + *(int *)arg = 0; + } else if (IS_TCP(so)) { + left = tle_tcp_stream_inq(so->s); + if (so->rx_left) + left += rte_pktmbuf_pkt_len(so->rx_left); + *(int *)arg = left; + } else { + if (so->rx_left) { + *(int *)arg = rte_pktmbuf_pkt_len(so->rx_left); + } else { + if (tle_udp_stream_recv(so->s, &m , 1) == 0) { + *(int *)arg = 0; + } else { + *(int *)arg = rte_pktmbuf_pkt_len(m); + so->rx_left = m; + } + } + } + rc = 0; + break; + case FIONBIO: + if (*(int *)arg) + so->nonblock = 1; + else + so->nonblock = 0; + rc = 0; + break; + case SIOCGSTAMP: + if (so->s->timestamp == 0) { + errno = ENOENT; + rc = -1; + } else { + ((struct timeval*)arg)->tv_sec = so->s->timestamp >> 20; + ((struct timeval*)arg)->tv_usec = so->s->timestamp & 0xFFFFFUL; + rc = 0; + } + break; + default: + errno = EOPNOTSUPP; + rc = -1; + rte_panic("ioctl(%d) with request = %ld", fd, request); + } + + return rc; +} + +int PRE(shutdown)(int sockfd, int how) +{ + struct sock *so; + + if (is_kernel_fd(sockfd)) + return k_shutdown(sockfd, how); + + so = fd2sock(sockfd); + if (OPS(so)->shutdown) + return OPS(so)->shutdown(so, how); + + errno = EOPNOTSUPP; + return -1; +} + +static inline int +getname(int sockfd, struct sockaddr *uaddr, socklen_t *addrlen, int peer) +{ + struct sock *so; + size_t socklen; + int rc; + + so = fd2sock(sockfd); + + /* This is ugly, but netperf ask for local addr (before any + * connect or bind) to check family. + * + * To formally fix this, we shall bind a local address in advance + */ + socklen = get_sockaddr_len(so); + /* fixme: It is not conform to linux standard, fix it later. 
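PRE(fcntl) above only models O_NONBLOCK (plus a tolerant F_SETFD), so the
usual application idiom keeps working unchanged:

    #include <fcntl.h>

    /* Sketch: the standard get/set-flags idiom the wrapper supports. */
    static int
    set_nonblock(int fd)
    {
            int fl = fcntl(fd, F_GETFL, 0);

            if (fl < 0)
                    return -1;
            return fcntl(fd, F_SETFL, fl | O_NONBLOCK);
    }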
*/ + if (*addrlen < socklen) { + errno = EINVAL; + return -1; + } + *addrlen = socklen; + + if (so->s == NULL) { + if (peer) { + errno = ENOTCONN; + return -1; + } else { + memset(uaddr, 0, socklen); + if (so->type == TLE_V4) + uaddr->sa_family = AF_INET; + else + uaddr->sa_family = AF_INET6; + return 0; + } + } + + if (OPS(so)->getname) { + rc = OPS(so)->getname(so, uaddr, peer); + if (rc < 0) + return rc; + if (peer) { + if ((uaddr->sa_family == AF_INET && + ((struct sockaddr_in*)uaddr)->sin_addr.s_addr == 0) || + (uaddr->sa_family == AF_INET6 && + IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*) + uaddr)->sin6_addr))) { + errno = ENOTCONN; + return -1; + } + } + if (uaddr->sa_family == AF_INET && so->type == TLE_V6) { + /* IPv4 mapped IPv6 socket */ + trans_4mapped6_addr(uaddr); + } + return rc; + } + + errno = EOPNOTSUPP; + return -1; +} + +int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen) +{ + if (is_kernel_fd(sockfd)) + return k_getsockname(sockfd, addr, addrlen); + + return getname(sockfd, addr, addrlen, 0); +} + +int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen) +{ + if (is_kernel_fd(sockfd)) + return k_getpeername(sockfd, addr, addrlen); + + return getname(sockfd, addr, addrlen, 1); +} + +int PRE(close)(int fd) +{ + struct sock *so; + + if (is_kernel_fd(fd)) + return k_close(fd); + + so = fd2sock(fd); + if (unlikely(so->valid == 0)) { + errno = EBADF; + return -1; + } else if (unlikely(so->epoll)) { + k_close(so->shadow_efd); + glue_ctx_free(CTX(so)); + } else if (so->s) { + if (OPS(so)->close) + OPS(so)->close(so->s); + + free_fd_event(so); + + if (IS_TCP(so)) { + if (so->rx_left) + rte_pktmbuf_free(so->rx_left); + + be_tx_with_lock(CTX(so)); + } + } + + memset(so, 0, sizeof(*so)); + put_free_fd(fd); + return 0; +} diff --git a/lib/libtle_glue/sym.c b/lib/libtle_glue/sym.c new file mode 100644 index 0000000..39b1707 --- /dev/null +++ b/lib/libtle_glue/sym.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
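Every PRE() wrapper in this file follows one dispatch pattern: kernel-managed
fds are forwarded to the saved libc symbol, everything else is resolved to a
struct sock and handled by the glue layer. Schematically (demo_op/k_demo_op
are invented names; is_kernel_fd() and fd2sock() come from this patch's fd
management):

    /* Sketch of the dispatch shared by the wrappers above. */
    int
    PRE(demo_op)(int fd)
    {
            if (is_kernel_fd(fd))
                    return k_demo_op(fd); /* hypothetical saved libc symbol */

            struct sock *so = fd2sock(fd);
            /* ... operate on so, typically via OPS(so)->... */
            (void)so;
            return 0;
    }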
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+#include <dlfcn.h>
+
+#include <rte_debug.h>
+
+#include "sym.h"
+#include "log.h"
+
+#ifdef PRELOAD
+int (*k_epoll_create)(int size);
+int (*k_epoll_create1)(int flags);
+int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event);
+int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout);
+int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask);
+int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
+int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
+int (*k_socket)(int domain, int type, int protocol);
+int (*k_listen)(int sockfd, int backlog);
+int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen);
+int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen);
+int (*k_fcntl)(int fd, int cmd, ... /* arg */ );
+int (*k_ioctl)(int d, int request, ...);
+int (*k_shutdown)(int sockfd, int how);
+int (*k_close)(int fd);
+ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags);
+ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
+ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags);
+ssize_t (*k_read)(int fd, void *buf, size_t count);
+ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt);
+ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags);
+ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
+ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags);
+ssize_t (*k_write)(int fd, const void *buf, size_t count);
+ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt);
+ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count);
+int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
+
+#define INIT_FUNC(func, handle) do {			\
+	k_##func = dlsym(handle, #func);		\
+	if ((error = dlerror()) != NULL) {		\
+		rte_panic(#func " is not initialized");	\
+	}						\
+	RTE_ASSERT(k_##func);				\
+} while (0)
+
+#endif
+
+void
+symbol_init(void)
+{
+#ifdef PRELOAD
+	void *handle;
+	char *error;
+
+	TRACE("in %s", __func__);
+
+	handle = dlopen("libc.so.6", RTLD_NOW);
+	error = dlerror();
+	if (!handle) {
+		fprintf(stderr, "%s\n", error);
+		exit(EXIT_FAILURE);
+	}
+
+	INIT_FUNC(epoll_create, handle);
+	INIT_FUNC(epoll_create1, handle);
+	INIT_FUNC(epoll_ctl, handle);
+	INIT_FUNC(epoll_wait, handle);
+	INIT_FUNC(epoll_pwait, handle);
+	INIT_FUNC(socket, handle);
+	INIT_FUNC(listen, handle);
+	INIT_FUNC(bind, handle);
+	INIT_FUNC(accept, handle);
+	INIT_FUNC(accept4, handle);
+	INIT_FUNC(connect, handle);
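+	/* The remaining libc entry points are resolved the same way;
+	 * INIT_FUNC() panics at startup on a missing symbol instead of
+	 * leaving a NULL pointer to be hit later on the data path. */
+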
INIT_FUNC(getsockopt, handle); + INIT_FUNC(setsockopt, handle); + INIT_FUNC(fcntl, handle); + INIT_FUNC(ioctl, handle); + INIT_FUNC(shutdown, handle); + INIT_FUNC(close, handle); + INIT_FUNC(recv, handle); + INIT_FUNC(recvfrom, handle); + INIT_FUNC(recvmsg, handle); + INIT_FUNC(read, handle); + INIT_FUNC(readv, handle); + INIT_FUNC(send, handle); + INIT_FUNC(sendto, handle); + INIT_FUNC(sendmsg, handle); + INIT_FUNC(write, handle); + INIT_FUNC(writev, handle); + INIT_FUNC(splice, handle); + INIT_FUNC(sendfile, handle); + INIT_FUNC(poll, handle); + INIT_FUNC(getsockname, handle); + INIT_FUNC(getpeername, handle); + INIT_FUNC(select, handle); + INIT_FUNC(pselect, handle); + + dlclose(handle); +#endif +} diff --git a/lib/libtle_glue/sym.h b/lib/libtle_glue/sym.h new file mode 100644 index 0000000..b5a333d --- /dev/null +++ b/lib/libtle_glue/sym.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_KSYM_H_ +#define _TLE_KSYM_H_ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tle_glue.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void symbol_init(void); + +#ifdef PRELOAD +int (*k_epoll_create)(int size); +int (*k_epoll_create1)(int flags); +int (*k_epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event); +int (*k_epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout); +int (*k_epoll_pwait)(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask); +int (*k_poll)(struct pollfd *fds, nfds_t nfds, int timeout); +int (*k_select)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout); +int (*k_pselect)(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask); + +int (*k_socket)(int domain, int type, int protocol); +int (*k_listen)(int sockfd, int backlog); +int (*k_bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int (*k_accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int (*k_accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags); +int (*k_connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int (*k_getsockopt)(int sockfd, int level, int optname, void *optval, socklen_t *optlen); +int (*k_setsockopt)(int sockfd, int level, int optname, const void *optval, socklen_t optlen); +int (*k_fcntl)(int fd, int cmd, ... 
/* arg */ ); +int (*k_ioctl)(int d, int request, ...); +int (*k_shutdown)(int sockfd, int how); +int (*k_close)(int fd); +ssize_t (*k_recv)(int sockfd, void *buf, size_t len, int flags); +ssize_t (*k_recvfrom)(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen); +ssize_t (*k_recvmsg)(int sockfd, struct msghdr *msg, int flags); +ssize_t (*k_read)(int fd, void *buf, size_t count); +ssize_t (*k_readv)(int fd, const struct iovec *iov, int iovcnt); +ssize_t (*k_send)(int sockfd, const void *buf, size_t len, int flags); +ssize_t (*k_sendto)(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen); +ssize_t (*k_sendmsg)(int sockfd, const struct msghdr *msg, int flags); +ssize_t (*k_write)(int fd, const void *buf, size_t count); +ssize_t (*k_writev)(int fd, const struct iovec *iov, int iovcnt); +ssize_t (*k_splice)(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags); +ssize_t (*k_sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); +int (*k_getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int (*k_getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +#else +#define k_epoll_create epoll_create +#define k_epoll_create1 epoll_create1 +#define k_epoll_ctl epoll_ctl +#define k_epoll_wait epoll_wait +#define k_epoll_pwait epoll_pwait +#define k_poll poll +#define k_select select +#define k_pselect pselect +#define k_socket socket +#define k_listen listen +#define k_bind bind +#define k_accept accept +#define k_accept4 accept4 +#define k_connect connect +#define k_getsockopt getsockopt +#define k_setsockopt setsockopt +#define k_fcntl fcntl +#define k_ioctl ioctl +#define k_shutdown shutdown +#define k_close close +#define k_recv recv +#define k_recvfrom recvfrom +#define k_recvmsg recvmsg +#define k_read read +#define k_readv readv +#define k_send send +#define k_sendto sendto +#define k_sendmsg sendmsg +#define k_write write +#define k_writev writev +#define k_splice splice +#define k_sendfile sendfile +#define k_getsockname getsockname +#define k_getpeername getpeername +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_KSYM_H_ */ diff --git a/lib/libtle_glue/tcp.c b/lib/libtle_glue/tcp.c new file mode 100644 index 0000000..cb66bcc --- /dev/null +++ b/lib/libtle_glue/tcp.c @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sym.h" +#include "fd.h" +#include "log.h" +#include "util.h" +#include "internal.h" +#include "sock.h" + +#define MAX_TCP_KEEPIDLE 32767 +#define MAX_TCP_KEEPINTVL 32767 + +static inline void +foo_support(const char *msg) +{ + GLUE_LOG(WARNING, "%s, return ok without really supporting it", msg); +} + +static int tcp_setsockopt(struct sock *sk, int optname, + const void *optval, socklen_t optlen) +{ + int val; + + val = 0; /* just to make compiler happy */ + if (optlen == sizeof(val)) + val = *(const int *)optval; + + /* man tcp(7) or see /usr/include/netinet/tcp.h */ + switch (optname) { + case TCP_NODELAY: /* antonym: TCP_CORK */ + if (val == 0) + sk->option.tcpnodelay = 0; + else + sk->option.tcpnodelay = 1; + if (sk->s != NULL) + sk->s->option.tcpnodelay = sk->option.tcpnodelay; + return 0; + case TCP_CORK: + if (val == 0) + sk->option.tcpcork = 0; + else + sk->option.tcpcork = 1; + if (sk->s != NULL) + sk->s->option.tcpcork = sk->option.tcpcork; + return 0; + case TCP_KEEPIDLE: + if (val <= 0 || val > MAX_TCP_KEEPIDLE) { + errno = EINVAL; + return -1; + } + sk->option.keepidle = val; + if (sk->s != NULL) + sk->s->option.keepidle = sk->option.keepidle; + return 0; + case TCP_KEEPINTVL: + if (val <= 0 || val > MAX_TCP_KEEPINTVL) { + errno = EINVAL; + return -1; + } + sk->option.keepintvl = val; + if (sk->s != NULL) + sk->s->option.keepintvl = sk->option.keepintvl; + return 0; + case TCP_USER_TIMEOUT: + foo_support("set TCP_USER_TIMEOUT"); + return 0; + case TCP_DEFER_ACCEPT: + if (val == 0) + return 0; + break; + case TCP_FASTOPEN: + case TCP_FASTOPEN_CONNECT: + if (val == 0) + return 0; + break; + case TCP_QUICKACK: + /* Based on below info, it's safe to just return 0: + * "This flag is not permanent, it only enables a + * switch to or from quickack mode. Subsequent + * operationof the TCP protocol will once again ..." + */ + if (val == 0) + sk->option.tcpquickack = 0; + else + sk->option.tcpquickack = 8; + if (sk->s != NULL) + sk->s->option.tcpquickack = sk->option.tcpquickack; + return 0; + case TCP_CONGESTION: + /* only support NewReno; but we return success for + * any kind of setting. + */ + foo_support("set TCP_CONGESTION"); + return 0; + default: + break; + } + + rte_panic("setsockopt(%d) with level = SOL_TCP, optname = %d\n", + sock2fd(sk), optname); + return -1; +} + +static int tcp_getsockopt(struct sock *sk, int optname, + void *optval, socklen_t *optlen) +{ + int rc; + union { + int val; + uint64_t val64; + struct linger ling; + struct timeval tm; + } *p = optval; + + RTE_SET_USED(optlen); + + /* man tcp(7) or see /usr/include/netinet/tcp.h */ + switch (optname) { + case TCP_MAXSEG: + p->val = 64 * 1024; + return 0; + case TCP_FASTOPEN: + case TCP_FASTOPEN_CONNECT: + p->val = 0; + return 0; + case TCP_INFO: + /* needed by netperf */ + rc = tle_tcp_stream_get_info(sk->s, optval, optlen); + if (rc < 0) { + errno = -rc; + return -1; + } + return 0; + case TCP_CONGESTION: + strncpy(optval, "NewReno", *optlen); + ((char *)optval)[*optlen - 1] = '\0'; + return 0; + case TCP_CORK: + p->val = sk->option.tcpcork; + return 0; + case TCP_QUICKACK: + p->val = sk->option.tcpquickack != 0 ? 
1 : 0; + return 0; + case TCP_NODELAY: + p->val = sk->option.tcpnodelay; + return 0; + case TCP_KEEPIDLE: + p->val = sk->option.keepidle; + return 0; + case TCP_KEEPINTVL: + p->val = sk->option.keepintvl; + return 0; + default: + break; + } + + rte_panic("getsockopt(%d) with level = SOL_TCP, optname = %d", + sock2fd(sk), optname); + return -1; +} + +static int tcp_getname(struct sock *sk, struct sockaddr *addr, int peer) +{ + int rc; + int addrlen; + struct tle_tcp_stream_addr a; + + rc = tle_tcp_stream_get_addr(sk->s, &a); + if (rc) { + errno = -rc; + return -1; + } + + if (a.local.ss_family == AF_INET) + addrlen = sizeof(struct sockaddr_in); + else + addrlen = sizeof(struct sockaddr_in6); + + if (peer) + memcpy(addr, &a.remote, addrlen); + else + memcpy(addr, &a.local, addrlen); + + addr->sa_family = a.local.ss_family; + + return 0; +} + +static int tcp_bind(struct sock *sk, const struct sockaddr *addr) +{ + sk->s = open_bind(sk, addr, NULL); + if (sk->s == NULL) + return -1; + return 0; +} + +static int tcp_listen(struct sock *sk, int backlog) +{ + int32_t rc; + + if (backlog < 0) { + errno = EINVAL; + return -1; + } + + rc = tle_tcp_stream_listen(sk->s); + if (rc) { + errno = -rc; + return -1; + } + + return 0; +} + +static int tcp_connect(struct sock *sk, const struct sockaddr *addr) +{ + int rc; + int rx; + int ret; + struct epoll_event event; + struct sockaddr_storage laddr; + struct sockaddr_storage raddr; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + struct sockaddr *local = NULL; + + /* TODO: For multi-thread case, we shall properly manage local + * L4 port so that packets coming back can be put into the same + * queue pair. + */ + if (sk->s) { + struct tle_tcp_stream *ts = TCP_STREAM(sk->s); + /* case 1: bind happens before connect; + * case 2: connect after a previous connect, failed + * or succeeded. + */ + if (ts->tcb.err != 0) { + errno = ts->tcb.err; + return -1; + } + + if (sk->txev && tle_event_state(sk->txev) != TLE_SEV_DOWN) + return 0; /* connect succeeds */ + + int state = ts->tcb.state; + + if (state == TCP_ST_CLOSED) { + if (tcp_getname(sk, (struct sockaddr *)&laddr, 0) == 0) + local = (struct sockaddr *)&laddr; + tle_tcp_stream_close(sk->s); + sk->s = NULL; + goto do_connect; /* case 1 */ + } else if (state >= TCP_ST_SYN_SENT && + state < TCP_ST_ESTABLISHED) + errno = EALREADY; + else if (state >= TCP_ST_ESTABLISHED) + errno = EISCONN; + else + errno = EINVAL; + return -1; + } + +do_connect: + sk->s = open_bind(sk, local, addr); + if (sk->s == NULL) /* errno is set */ + return -1; + + if (sk->s->type == TLE_V4) { + addr4 = (struct sockaddr_in*)&raddr; + addr4->sin_family = AF_INET; + addr4->sin_port = sk->s->port.src; + addr4->sin_addr.s_addr = sk->s->ipv4.addr.src; + } else { + addr6 = (struct sockaddr_in6*)&raddr; + addr6->sin6_family = AF_INET6; + addr6->sin6_port = sk->s->port.src; + rte_memcpy(&addr6->sin6_addr, &sk->s->ipv6.addr.src, + sizeof(struct in6_addr)); + } + rc = tle_tcp_stream_connect(sk->s, (const struct sockaddr*)&raddr); + if (rc < 0) { + errno = -rc; + return -1; + } + + if (is_nonblock(sk, 0)) { + be_tx_with_lock(CTX(sk)); + /* It could not be ready so fast */ + errno = EINPROGRESS; + return -1; + } + + do { + be_process(CTX(sk)); + + if (tle_event_state(sk->txev) == TLE_SEV_UP) { + tle_event_down(sk->txev); + ret = 0; + break; + } + + if (tle_event_state(sk->erev) == TLE_SEV_UP) { + tle_event_down(sk->erev); + errno = ECONNREFUSED; + ret = -1; + break; + } + + /* fix me: timeout? 
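+		 * The wait below passes -1 (no timeout), so a blocking
+		 * connect() can wait indefinitely and SO_SNDTIMEO is not
+		 * honoured here.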
*/ + epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx); + } while (1); + + return ret; +} + +static void tcp_update_cfg(struct sock *sk); + +static int tcp_accept(struct sock *sk, struct sockaddr *addr, + socklen_t *addrlen, int flags) +{ + int fd; + int rx; + struct sock *newsk; + struct tle_stream *rs; + struct tle_tcp_stream_addr a; + + fd = get_unused_fd(); + if (fd < 0) { + errno = ENFILE; + return -1; + } + + newsk = fd2sock(fd); +again: + if (tle_tcp_stream_accept(sk->s, &rs, 1) == 0) { + if (rte_errno != EAGAIN) { + errno = rte_errno; + return -1; + } + + struct epoll_event event; + + if (is_nonblock(sk, flags)) { + newsk->valid = 0; + put_free_fd(fd); + errno = EAGAIN; + return -1; + } + + epoll_kernel_wait(CTX(sk), -1, &event, 1, 1, &rx); + + be_process(CTX(sk)); + + goto again; + } + + newsk->s = rs; + newsk->cid = sk->cid; + newsk->type = sk->type; + newsk->proto = sk->proto; + newsk->option.raw = 0; + newsk->option.tcpquickack = 1; + newsk->option.mulloop = 1; + newsk->option.multtl = 1; + newsk->option.keepidle = 2 * 60 * 60; + newsk->option.keepintvl = 75; + newsk->s->option.raw = newsk->option.raw; + sock_alloc_events(newsk); + tcp_update_cfg(newsk); + + if (addr) { + /* We assume this function never fails */ + tle_tcp_stream_get_addr(rs, &a); + + *addrlen = sizeof(struct sockaddr_in); + memcpy(addr, &a.remote, *addrlen); + } + + return fd; +} + +static ssize_t tcp_send(struct sock *sk, struct rte_mbuf *pkt[], + uint16_t num, const struct sockaddr *dst_addr) +{ + uint16_t rc; + RTE_SET_USED(dst_addr); + + rc = tle_tcp_stream_send(sk->s, pkt, num); + if (rc == 0) + errno = rte_errno; + + return rc; +} + +static ssize_t tcp_recv(struct tle_stream *s, struct rte_mbuf *pkt[], + uint16_t num, struct sockaddr *addr) +{ + uint16_t rc; + + RTE_SET_USED(addr); + + /* optimize me: merge multiple mbufs into one */ + rc = tle_tcp_stream_recv(s, pkt, num); + if (rc == 0) + errno = rte_errno; + + return rc; +} + +static ssize_t tcp_readv(struct tle_stream *ts, const struct iovec *iov, + int iovcnt, struct msghdr *msg) +{ + ssize_t rc; + + rc = tle_tcp_stream_readv_msg(ts, iov, iovcnt, msg); + if (rc < 0) + errno = rte_errno; + return rc; +} + +static ssize_t tcp_writev(struct sock *sk, const struct iovec *iov, + int iovcnt, const struct sockaddr *dst_addr) +{ + ssize_t rc; + struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */ + + RTE_SET_USED(dst_addr); + + rc = tle_tcp_stream_writev(sk->s, mp, iov, iovcnt); + if (rc < 0) + errno = rte_errno; + return rc; +} + +static int tcp_shutdown(struct sock *sk, int how) +{ + int ret; + + ret = tle_tcp_stream_shutdown(sk->s, how); + if (how == SHUT_RDWR) + sk->s = NULL; + + be_tx_with_lock(CTX(sk)); + return ret; + +} + +static void tcp_update_cfg(struct sock *sk) +{ + struct tle_tcp_stream_cfg prm; + memset(&prm, 0, sizeof(prm)); + + prm.recv_ev = sk->rxev; + prm.send_ev = sk->txev; + prm.err_ev = sk->erev; + + tle_tcp_stream_update_cfg(&sk->s, &prm, 1); +} + +struct proto tcp_prot = { + .name = "TCP", + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .getname = tcp_getname, + .bind = tcp_bind, + .listen = tcp_listen, + .connect = tcp_connect, + .accept = tcp_accept, + .recv = tcp_recv, + .send = tcp_send, + .readv = tcp_readv, + .writev = tcp_writev, + .shutdown = tcp_shutdown, + .close = tle_tcp_stream_close, + .update_cfg = tcp_update_cfg, +}; diff --git a/lib/libtle_glue/tle_glue.h b/lib/libtle_glue/tle_glue.h new file mode 100644 index 0000000..38357e4 --- /dev/null +++ b/lib/libtle_glue/tle_glue.h @@ -0,0 +1,114 
@@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_H_ +#define _TLE_GLUE_H_ + +#include +#include +#include +#include +#include + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef PRELOAD + +#define PRE(name) name + +#else + +#define PRE(name) tle_ ## name + +#endif + +void glue_init1(int argc, char **argv); + +/* epoll */ +int PRE(epoll_create)(int size); +int PRE(epoll_create1)(int flags); +int PRE(epoll_ctl)(int epfd, int op, int fd, struct epoll_event *event); +int PRE(epoll_wait)(int epfd, struct epoll_event *events, int maxevents, int timeout); +int PRE(epoll_pwait)(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask); + +/* for setup, settings, and destroy */ +int PRE(socket)(int domain, int type, int protocol); +int PRE(listen)(int sockfd, int backlog); +int PRE(bind)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int PRE(accept)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int PRE(accept4)(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags); +int PRE(connect)(int sockfd, const struct sockaddr *addr, socklen_t addrlen); +int PRE(getsockopt)(int sockfd, int level, int optname, + void *optval, socklen_t *optlen); +int PRE(setsockopt)(int sockfd, int level, int optname, + const void *optval, socklen_t optlen); +int PRE(getsockname)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int PRE(getpeername)(int sockfd, struct sockaddr *addr, socklen_t *addrlen); +int PRE(fcntl)(int fd, int cmd, ... 
/* arg */ ); +int PRE(ioctl)(int d, unsigned long int request, ...); +int PRE(shutdown)(int sockfd, int how); +int PRE(close)(int fd); + +/* for recv */ +ssize_t PRE(recv)(int sockfd, void *buf, size_t len, int flags); +ssize_t PRE(recvfrom)(int sockfd, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen); +ssize_t PRE(recvmsg)(int sockfd, struct msghdr *msg, int flags); +ssize_t PRE(read)(int fd, void *buf, size_t count); +ssize_t PRE(readv)(int fd, const struct iovec *iov, int iovcnt); + +/* for send */ +ssize_t PRE(send)(int sockfd, const void *buf, size_t len, int flags); +ssize_t PRE(sendto)(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); +ssize_t PRE(sendmsg)(int sockfd, const struct msghdr *msg, int flags); +ssize_t PRE(write)(int fd, const void *buf, size_t count); +ssize_t PRE(writev)(int fd, const struct iovec *iov, int iovcnt); + +/* advanced functions */ +ssize_t PRE(splice)(int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags); +ssize_t PRE(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); + +/* poll */ +int PRE(poll)(struct pollfd *fds, nfds_t nfds, int timeout); +int PRE(ppoll)(struct pollfd *fds, nfds_t nfds, + const struct timespec *tmo_p, const sigset_t *sigmask); + +/* select */ +int PRE(select)(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout); +int PRE(pselect)(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, const struct timespec *timeout, + const sigset_t *sigmask); + +/* non-posix APIs */ +int fd_ready(int fd, int events); +void v_get_stats_snmp(unsigned long mibs[]); + +#ifdef __cplusplus +} +#endif + +#endif /* _TLE_GLUE_H_ */ diff --git a/lib/libtle_glue/udp.c b/lib/libtle_glue/udp.c new file mode 100644 index 0000000..6e08d03 --- /dev/null +++ b/lib/libtle_glue/udp.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "sym.h" +#include "fd.h" +#include "log.h" +#include "util.h" +#include "internal.h" +#include "sock.h" +#include "../libtle_l4p/udp_stream.h" + +static inline int udp_setsockopt(__rte_unused struct sock *sk, + __rte_unused int optname, + __rte_unused const void *optval, + __rte_unused socklen_t optlen) +{ + return 0; +} + +static inline int udp_getsockopt(__rte_unused struct sock *sk, + __rte_unused int optname, + __rte_unused void *optval, + __rte_unused socklen_t *optlen) +{ + return 0; +} + +static int udp_getname(struct sock *sk, struct sockaddr *addr, int peer) +{ + int rc; + struct tle_udp_stream_param p; + size_t addrlen; + + rc = tle_udp_stream_get_param(sk->s, &p); + if (rc) { + errno = -rc; + return -1; + } + + addrlen = get_sockaddr_len(sk); + + if (peer) + memcpy(addr, &p.remote_addr, addrlen); + else + memcpy(addr, &p.local_addr, addrlen); + + addr->sa_family = p.local_addr.ss_family; + + return 0; +} + +static int udp_bind(struct sock *sk, const struct sockaddr *addr) +{ + if (sk->ubind) { + /* The socket is already bound to an address */ + errno = EINVAL; + return -1; + } + + sk->s = open_bind(sk, addr, NULL); + if (sk->s != NULL) { + sk->ubind = 1; + return 0; + } + + return -1; +} + +static int udp_connect(struct sock *sk, const struct sockaddr *addr) +{ + struct sockaddr laddr; + + if (sk->ubind) { + if (udp_getname(sk, &laddr, 0)) + return -1; + sk->s = open_bind(sk, &laddr, addr); + } else + sk->s = open_bind(sk, NULL, addr); + + if (sk->s) + return 0; + + return -1; +} + +static int +udp_addr_prepare(struct sock *sk, const struct sockaddr **p_dst_addr, + struct sockaddr_storage *addr) +{ + const struct sockaddr* dst_addr = *p_dst_addr; + + if (dst_addr != NULL && + dst_addr->sa_family == AF_INET6 && IN6_IS_ADDR_V4MAPPED( + &((const struct sockaddr_in6*)dst_addr)->sin6_addr)) { + /* process V4 mapped V6 address */ + rte_memcpy(addr, dst_addr, sizeof(struct sockaddr_in6)); + dst_addr = (const struct sockaddr*)(addr); + *p_dst_addr = dst_addr; + retrans_4mapped6_addr((struct sockaddr_storage*)(addr)); + } + + if (sk->s == NULL) { + if (dst_addr == NULL) { + errno = EDESTADDRREQ; + return -1; + } + + sk->s = open_bind(sk, NULL, dst_addr); + if (sk->s == NULL) /* errno is set */ + return -1; + } else if (dst_addr != NULL) { + if (dst_addr->sa_family == AF_INET && sk->s->type == TLE_V6) { + if (IN6_IS_ADDR_UNSPECIFIED(&sk->s->ipv6.addr.dst)) { + sk->s->type = TLE_V4; + sk->s->ipv4.addr.dst = 0; + } else { + errno = ENETUNREACH; + return -1; + } + } else if (dst_addr->sa_family == AF_INET6 && sk->s->type == TLE_V4) { + errno = EINVAL; + return -1; + } + } + + return 0; +} + +/* abstract client info from mbuf into s */ +static inline void +udp_pkt_addr(const struct rte_mbuf *m, struct sockaddr *addr, __rte_unused uint16_t family) +{ + const struct ipv4_hdr *ip4h; + const struct ipv6_hdr *ip6h; + const struct udp_hdr *udph; + struct sockaddr_in *in4; + struct sockaddr_in6 *in6; + int off = -(m->l4_len + m->l3_len); + + udph = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, -m->l4_len); + ip4h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, off); + if ((ip4h->version_ihl>>4) == 4) { + addr->sa_family = AF_INET; + in4 = (struct sockaddr_in *)addr; + in4->sin_port = udph->src_port; + in4->sin_addr.s_addr = ip4h->src_addr; + } else { + addr->sa_family = AF_INET6; + ip6h = (const struct ipv6_hdr*)ip4h; + in6 = (struct sockaddr_in6 *)addr; + in6->sin6_port = udph->src_port; + 
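+		/* copy the 128-bit IPv6 source address out of the header */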
rte_memcpy(&in6->sin6_addr, ip6h->src_addr, + sizeof(in6->sin6_addr)); + } +} + +static ssize_t +udp_send(struct sock *sk, struct rte_mbuf *pkt[], + uint16_t num, const struct sockaddr *dst_addr) +{ + uint16_t i; + struct sockaddr_storage addr; + if (udp_addr_prepare(sk, &dst_addr, &addr) != 0) + return 0; + + /* chain them together as *one* message */ + for (i = 1; i < num; ++i) { + pkt[i-1]->next = pkt[i]; + pkt[0]->pkt_len += pkt[i]->pkt_len; + } + pkt[0]->nb_segs = num; + + if (tle_udp_stream_send(sk->s, &pkt[0], 1, dst_addr) == 0) { + errno = rte_errno; + return 0; + } + + return num; +} + +static ssize_t udp_readv(struct tle_stream *s, const struct iovec *iov, + int iovcnt, struct msghdr *msg) +{ + uint16_t rc; + ssize_t sz; + struct rte_mbuf *m; + struct iovec iv; + uint32_t fin; + + sz = 0; + rc = tle_udp_stream_recv(s, &m, 1); + + if (rc > 0) { + if (!timestamp_needed) + s->timestamp = m->timestamp; + if (msg != NULL && msg->msg_control != NULL) { + if (timestamp_needed) + set_msg_timestamp(msg, m); + else + msg->msg_controllen = 0; + } + + if (msg != NULL && msg->msg_name != NULL) { + udp_pkt_addr(m, (struct sockaddr*)msg->msg_name, 0); + if (((struct sockaddr *)msg->msg_name)->sa_family == AF_INET) { + msg->msg_namelen = sizeof(struct sockaddr_in); + } else { + msg->msg_namelen = sizeof(struct sockaddr_in6); + } + } + + for (int i = 0; i != iovcnt; i++) { + iv = iov[i]; + sz += iv.iov_len; + fin = _mbus_to_iovec(&iv, &m, 1); + if (fin == 1) { + sz -= iv.iov_len; + break; + } + } + if (fin == 0) + rte_pktmbuf_free_seg(m); + return sz; + } else { + errno = rte_errno; + if (errno == EAGAIN && !rwl_is_up(&(UDP_STREAM(s)->rx.use))) + /* rx is shutdown */ + errno = ESHUTDOWN; + return -1; + } +} + +static ssize_t udp_writev(struct sock *sk, const struct iovec *iov, + int iovcnt, const struct sockaddr *dst_addr) +{ + struct rte_mempool *mp = get_mempool_by_socket(0); /* fix me */ + uint16_t i, rc, nb_mbufs; + uint32_t slen, left_m, left_b, copy_len, left; + char *dst, *src; + uint64_t ufo; + size_t total; + int j; + struct sockaddr_storage addr; + + if (udp_addr_prepare(sk, &dst_addr, &addr) != 0) + return -1; + + for (j = 0, total = 0; j < iovcnt; ++j) { + total += iov[j].iov_len; + } + + ufo = tx_offload & DEV_TX_OFFLOAD_UDP_TSO; + if (ufo) + slen = RTE_MBUF_DEFAULT_DATAROOM; + else + slen = 1500 - 20; /* mtu - ip_hdr_len */ + + nb_mbufs = (total + 8 + slen - 1) / slen; + struct rte_mbuf *mbufs[nb_mbufs]; + if (unlikely(rte_pktmbuf_alloc_bulk(mp, mbufs, nb_mbufs) != 0)) { + errno = ENOMEM; + return -1; + } + + for (i = 0, j = 0, left_b = iov[0].iov_len; + i < nb_mbufs && j < iovcnt; ++i) { + /* first frag has udp hdr, its payload is 8 bytes less */ + if (i == 0) { + slen -= 8; + } else if (i == 1) { + slen += 8; + } + left_m = slen; + while (left_m > 0 && j < iovcnt) { + copy_len = RTE_MIN(left_m, left_b); + dst = rte_pktmbuf_mtod_offset(mbufs[i], char *, + slen - left_m); + src = (char *)iov[j].iov_base + iov[j].iov_len - left_b; + rte_memcpy(dst, src, copy_len); + + left_m -= copy_len; + left_b -= copy_len; + if (left_b == 0) { + j++; + left_b = iov[j].iov_len; + } + } + mbufs[i]->data_len = slen; + mbufs[i]->pkt_len = slen; + } + + /* last seg */ + if (nb_mbufs == 1) { + mbufs[nb_mbufs - 1]->data_len = total; + mbufs[nb_mbufs - 1]->pkt_len = total; + } else { + mbufs[nb_mbufs - 1]->data_len = total - (nb_mbufs - 1) * slen + 8; + mbufs[nb_mbufs - 1]->pkt_len = total - (nb_mbufs - 1) * slen + 8; + } + + /* chain as *one* message */ + for (i = 1; i < nb_mbufs; ++i) + mbufs[i-1]->next 
= mbufs[i];
+	mbufs[0]->nb_segs = nb_mbufs;
+	mbufs[0]->pkt_len = total;
+	nb_mbufs = 1;
+
+	rc = tle_udp_stream_send(sk->s, mbufs, nb_mbufs, dst_addr);
+	for (i = rc, left = 0; i < nb_mbufs; ++i) {
+		left += mbufs[i]->pkt_len;
+		rte_pktmbuf_free(mbufs[i]);
+	}
+
+	if (rc == 0) {
+		errno = rte_errno;
+		return -1;
+	}
+
+	return total - left;
+}
+
+static ssize_t
+udp_recv(struct tle_stream *s, struct rte_mbuf *pkt[], uint16_t num,
+	 struct sockaddr *addr)
+{
+	uint16_t rc;
+
+	rc = tle_udp_stream_recv(s, pkt, num);
+	if (addr && num == 1 && rc == 1) {
+		udp_pkt_addr(pkt[0], addr, 0);
+	}
+
+	if (rc == 0) {
+		errno = rte_errno;
+		if (errno == EAGAIN && !rwl_is_up(&(UDP_STREAM(s)->rx.use)))
+			/* rx is shutdown */
+			errno = ESHUTDOWN;
+	}
+
+	return rc;
+}
+
+static void udp_update_cfg(struct sock *sk)
+{
+	struct tle_udp_stream_param prm;
+	memset(&prm, 0, sizeof(prm));
+
+	prm.recv_ev = sk->rxev;
+	prm.send_ev = sk->txev;
+
+	tle_udp_stream_update_cfg(&sk->s, &prm, 1);
+}
+
+static int udp_shutdown(struct sock *sk, int how)
+{
+	if (sk->s == NULL) {
+		errno = ENOTCONN;
+		return -1;
+	}
+	/* TODO: the implementation should be moved
+	 * into l4p and encapsulated better */
+	struct tle_udp_stream *stream = UDP_STREAM(sk->s);
+	switch (how) {
+	case SHUT_RD:
+		rwl_down(&stream->rx.use);
+		break;
+	case SHUT_WR:
+		rwl_down(&stream->tx.use);
+		break;
+	case SHUT_RDWR:
+		rwl_down(&stream->rx.use);
+		rwl_down(&stream->tx.use);
+		break;
+	default:
+		errno = EINVAL;
+		return -1;
+	}
+
+	tle_event_raise(sk->rxev);
+	return 0;
+}
+
+struct proto udp_prot = {
+	.name = "UDP",
+	.setsockopt = udp_setsockopt,
+	.getsockopt = udp_getsockopt,
+	.getname = udp_getname,
+	.bind = udp_bind,
+	.connect = udp_connect,
+	.recv = udp_recv,
+	.send = udp_send,
+	.readv = udp_readv,
+	.writev = udp_writev,
+	.shutdown = udp_shutdown,
+	.close = tle_udp_stream_close,
+	.update_cfg = udp_update_cfg,
+};
diff --git a/lib/libtle_glue/util.c b/lib/libtle_glue/util.c
new file mode 100644
index 0000000..cb321dd
--- /dev/null
+++ b/lib/libtle_glue/util.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include + +#include "util.h" + +#define NUMA_NODE_PATH "/sys/devices/system/node" + +static unsigned +eal_cpu_socket_id(unsigned lcore_id) +{ + unsigned socket; + + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, + socket, lcore_id); + if (access(path, F_OK) == 0) + return socket; + } + return 0; +} + +uint32_t +get_socket_id(void) +{ + int err; + uint32_t i; + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + err = pthread_getaffinity_np(pthread_self(), + sizeof(cpuset), &cpuset); + if (err) + return 0; + + for (i = 0; i < CPU_SETSIZE; i++) + if (CPU_ISSET(i, &cpuset)) + break; + + return eal_cpu_socket_id(i); +} diff --git a/lib/libtle_glue/util.h b/lib/libtle_glue/util.h new file mode 100644 index 0000000..4b4aa38 --- /dev/null +++ b/lib/libtle_glue/util.h @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_UTIL_H_ +#define _TLE_GLUE_UTIL_H_ + +#include +#include +#include +#include + +#include +#include + +#include "../libtle_l4p/tcp_stream.h" + +#include "fd.h" +#include "ctx.h" +#include "sock.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void * +xstrdup(const void *old) +{ + void *new = strdup(old); + if (unlikely(new == NULL)) + rte_panic("Failed to strdup"); + return new; +} + +static inline void * +xmalloc(size_t size) +{ + void *p = malloc(size ? size : 1); + if (p == NULL) + rte_panic("Failed to malloc"); + return p; +} + +static inline char * +xvasprintf(const char *format, va_list args) +{ + va_list args2; + size_t needed; + char *s; + + va_copy(args2, args); + needed = vsnprintf(NULL, 0, format, args); + + s = xmalloc(needed + 1); + + vsnprintf(s, needed + 1, format, args2); + va_end(args2); + + return s; +} + +static inline char * +xasprintf(const char *format, ...) 
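+/* printf-style helper: sizes the buffer with vsnprintf() and panics
+ * on allocation failure, matching xmalloc() above */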
+{ + va_list args; + char *s; + + va_start(args, format); + s = xvasprintf(format, args); + va_end(args); + + return s; +} + +static inline char ** +grow_argv(char **argv, size_t cur_siz, size_t grow_by) +{ + char **p; + + p = realloc(argv, sizeof(char *) * (cur_siz + grow_by)); + if (unlikely(p == NULL)) + rte_panic("Failed to grow argv"); + return p; +} + +static inline void +release_argv(int argc, char **argv_to_release, char **argv) +{ + int i; + + for (i = 0; i < argc; ++i) + free(argv_to_release[i]); + + free(argv_to_release); + free(argv); +} + +static inline void +sock_alloc_events(struct sock *so) +{ + so->erev = tle_event_alloc(CTX(so)->ereq, so); + so->rxev = tle_event_alloc(CTX(so)->rxeq, so); + so->txev = tle_event_alloc(CTX(so)->txeq, so); + tle_event_active(so->erev, TLE_SEV_DOWN); + tle_event_active(so->rxev, TLE_SEV_DOWN); + tle_event_active(so->txev, TLE_SEV_DOWN); +} + +static inline void +sock_active_events(struct sock *so) +{ + tle_event_active(so->erev, TLE_SEV_DOWN); + tle_event_active(so->rxev, TLE_SEV_DOWN); + tle_event_active(so->txev, TLE_SEV_DOWN); +} + +static inline const struct in6_addr* +select_local_addr_v6(const struct sockaddr *remote, struct glue_ctx *ctx) +{ + /* todo: implement route table to decide local address */ + + if (IN6_IS_ADDR_LOOPBACK(&((const struct sockaddr_in6 *)remote) + ->sin6_addr)) + return &in6addr_loopback; + else + return &ctx->ipv6; +} + +static inline in_addr_t +select_local_addr(const struct sockaddr *remote, struct glue_ctx *ctx) +{ + /* todo: implement route table to decide local address */ + in_addr_t remote_addr; + + remote_addr = ((const struct sockaddr_in*)remote)->sin_addr.s_addr; + if (remote_addr == htonl(INADDR_LOOPBACK)) + return htonl(INADDR_LOOPBACK); + else + return ctx->ipv4; +} + +/* transform an IPv4 address(in struct sockaddr_in) to + * an IPv4 mapped IPv6 address(in struct sockaddr_in6) */ +static inline void +trans_4mapped6_addr(struct sockaddr *addr) +{ + struct sockaddr_in6 *addr6; + + if (addr->sa_family != AF_INET) + return; + + addr6 = (struct sockaddr_in6*)addr; + addr6->sin6_family = AF_INET6; + addr6->sin6_addr.s6_addr32[0] = 0; + addr6->sin6_addr.s6_addr32[1] = 0; + addr6->sin6_addr.s6_addr32[2] = 0xffff0000; + addr6->sin6_addr.s6_addr32[3] = ((struct sockaddr_in*)addr)->sin_addr.s_addr; +} + +/* transform an IPv4 mapped IPv6 address(in struct sockaddr_in6) to + * an IPv4 address(in struct sockaddr_in) */ +static inline void +retrans_4mapped6_addr(struct sockaddr_storage * addr) +{ + struct in6_addr* addr6; + if (addr->ss_family == AF_INET) + return; + + addr6 = &((struct sockaddr_in6*)addr)->sin6_addr; + if(IN6_IS_ADDR_V4MAPPED(addr6)) { + addr->ss_family = AF_INET; + ((struct sockaddr_in*)addr)->sin_addr.s_addr = addr6->__in6_u.__u6_addr32[3]; + } +} + +static inline struct tle_stream * +open_bind(struct sock *so, const struct sockaddr *local, + const struct sockaddr *remote) +{ + struct tle_stream *s; + struct sockaddr_storage *l, *r; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + struct tle_tcp_stream_param pt = {0}; + struct tle_udp_stream_param pu = {0}; + + if (so->rxev == NULL) + sock_alloc_events(so); + else + sock_active_events(so); + + if (IS_TCP(so)) { + pt.option = so->option.raw; + l = &pt.addr.local; + r = &pt.addr.remote; + pt.cfg.err_ev = so->erev; + pt.cfg.recv_ev = so->rxev; + pt.cfg.send_ev = so->txev; + } else { + pu.option = so->option.raw; + l = &pu.local_addr; + r = &pu.remote_addr; + pu.recv_ev = so->rxev; + pu.send_ev = so->txev; + } + + if (remote) { + 
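+		/* normalize the remote address: unmap a V4-mapped V6
+		 * address, and fall back to loopback when the peer
+		 * address is left unspecified */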
memcpy(r, remote, get_sockaddr_len_family(remote->sa_family)); + retrans_4mapped6_addr(r); + if(r->ss_family == AF_INET) { + addr4 = (struct sockaddr_in*)r; + if (addr4->sin_addr.s_addr == 0) + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + } else { + addr6 = (struct sockaddr_in6*)r; + if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr)) + rte_memcpy(&addr6->sin6_addr, &in6addr_loopback, + sizeof(struct in6_addr)); + } + } + + if (local) { + memcpy(l, local, get_sockaddr_len_family(local->sa_family)); + retrans_4mapped6_addr(l); + } else { + l->ss_family = r->ss_family; + } + + if (!remote) + r->ss_family = l->ss_family; + + /* Endpoints of stream have different socket families */ + if (r->ss_family != l->ss_family) { + if (l->ss_family == AF_INET) { + errno = EINVAL; + return NULL; + } else { + /* if local addr is unbound, convert into remote family */ + if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6*)l)->sin6_addr)) { + l->ss_family = AF_INET; + ((struct sockaddr_in*)l)->sin_addr.s_addr = 0; + } else { + errno = ENETUNREACH; + return NULL; + } + } + } + + if (l->ss_family == AF_INET) { + addr4 = (struct sockaddr_in*)l; + if (addr4->sin_addr.s_addr == htonl(INADDR_ANY) && remote) { + addr4->sin_addr.s_addr = + select_local_addr((struct sockaddr*)r, CTX(so)); + if (addr4->sin_addr.s_addr == htonl(INADDR_ANY)) { + errno = EADDRNOTAVAIL; + return NULL; + } + } + else if (addr4->sin_addr.s_addr != CTX(so)->ipv4 && + addr4->sin_addr.s_addr != htonl(INADDR_LOOPBACK) && + addr4->sin_addr.s_addr != htonl(INADDR_ANY)) { + errno = EADDRNOTAVAIL; + return NULL; + } + } else { + addr6 = (struct sockaddr_in6 *)l; + if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr) && remote) { + memcpy(&addr6->sin6_addr, + select_local_addr_v6((struct sockaddr*)r, CTX(so)), + sizeof(struct in6_addr)); + if (IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr)) { + errno = EADDRNOTAVAIL; + return NULL; + } + } + else if (memcmp(&addr6->sin6_addr, &CTX(so)->ipv6, + sizeof(struct in6_addr)) != 0 && + (!IN6_IS_ADDR_LOOPBACK(&addr6->sin6_addr)) && + (!IN6_IS_ADDR_UNSPECIFIED(&addr6->sin6_addr))) { + errno = EADDRNOTAVAIL; + return NULL; + } + } + + if (IS_TCP(so)) + s = tle_tcp_stream_open(CTX(so)->tcp_ctx, &pt); + else { + if (so->s == NULL) + s = tle_udp_stream_open(CTX(so)->udp_ctx, &pu); + else + s = tle_udp_stream_set(so->s, CTX(so)->udp_ctx, &pu); + } + + if (s == NULL) + errno = rte_errno; + + return s; +} + +static inline struct tle_stream * +open_bind_listen(struct sock *so, const struct sockaddr *local) +{ + struct tle_stream *s = open_bind(so, local, NULL); + + if (s == NULL) + return NULL; + + if (tle_tcp_stream_listen(s) != 0) { + tle_tcp_stream_close(s); + return NULL; + } + + return s; +} + +uint32_t get_socket_id(void); + +#ifdef __cplusplus +} +#endif + +#endif /*_TLE_GLUE_UTIL_H_ */ diff --git a/lib/libtle_glue/zerocopy.h b/lib/libtle_glue/zerocopy.h new file mode 100644 index 0000000..a37f8f5 --- /dev/null +++ b/lib/libtle_glue/zerocopy.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TLE_GLUE_ZEROCOPY_H_ +#define _TLE_GLUE_ZEROCOPY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * This API performs recv operation on specified socket, and it's + * optimized for zero copy, which means the caller does not need to + * prepare the buffer, instead, it will get a pointer on success. + * @param sockfd + * the file descriptor for the socket. + * @param buf + * after successfully receiving some payload, the pointer of the + * received buffer will be stored in *buf. + * @return + * the number of bytes received, or -1 if an error occurred, or 0 + * if a stream socket peer has performed an orderly shutdown. + * + */ +ssize_t recv_zc(int sockfd, void **buf); + +/** + * This API performs send operation on specified socket, and it's + * optimized for zero copy, which means the caller does not need to + * free the buffer, not even touch that buffer even after calling this + * API; the buffer will be freed after an ack from the socket peer. + * @param sockfd + * the file descriptor for the socket. + * @param buf + * The pointer to the payload buffer to be sent. + * @param len + * The length of the payload buffer to be sent. + * @return + * the number of bytes sent, or -1 if an error occurred. + */ +ssize_t send_zc(int sockfd, const void *buf, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /*_TLE_GLUE_ZEROCOPY_H_ */ diff --git a/lib/libtle_l4p/Makefile b/lib/libtle_l4p/Makefile index e1357d1..ee81d4a 100644 --- a/lib/libtle_l4p/Makefile +++ b/lib/libtle_l4p/Makefile @@ -45,6 +45,7 @@ SYMLINK-y-include += tle_ctx.h SYMLINK-y-include += tle_event.h SYMLINK-y-include += tle_tcp.h SYMLINK-y-include += tle_udp.h +SYMLINK-y-include += tle_stats.h # this lib dependencies DEPDIRS-y += lib/libtle_misc diff --git a/lib/libtle_l4p/ctx.c b/lib/libtle_l4p/ctx.c index 910fc88..3a9098e 100644 --- a/lib/libtle_l4p/ctx.c +++ b/lib/libtle_l4p/ctx.c @@ -21,9 +21,14 @@ #include #include "stream.h" +#include "stream_table.h" #include "misc.h" #include +struct tle_mib default_mib; + +RTE_DEFINE_PER_LCORE(struct tle_mib *, mib) = &default_mib; + #define LPORT_START 0x8000 #define LPORT_END MAX_PORT_NUM @@ -38,6 +43,13 @@ const struct in6_addr tle_ipv6_none = { }, }, }; +const struct in6_addr tle_ipv6_loopback = IN6ADDR_LOOPBACK_INIT; +const struct in6_addr tle_ipv6_all_multi = {{{ 0xff, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}}; +const struct in6_addr tle_ipv6_multi_mask = {{{ 0xff, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}}; + + struct stream_ops tle_stream_ops[TLE_PROTO_NUM] = {}; @@ -103,6 +115,16 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm) ctx->prm = *ctx_prm; + rc = bhash_init(ctx); + if (rc != 0) { + UDP_LOG(ERR, "create bhash table (ctx=%p, proto=%u) failed " + "with error code: %d;\n", + ctx, ctx_prm->proto, rc); + tle_ctx_destroy(ctx); + rte_errno = -rc; + return NULL; + } + rc = tle_stream_ops[ctx_prm->proto].init_streams(ctx); if (rc != 0) { UDP_LOG(ERR, "init_streams(ctx=%p, proto=%u) failed " @@ -114,9 +136,10 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm) } for (i = 0; i != RTE_DIM(ctx->use); i++) - tle_pbm_init(ctx->use + i, LPORT_START_BLK); + tle_psm_init(ctx->use + i); - ctx->streams.nb_free = ctx->prm.max_streams; + ctx->streams.nb_free = ctx->prm.min_streams; + ctx->streams.nb_cur = ctx->prm.min_streams; /* Initialization of 
siphash state is done here to speed up the * fastpath processing. @@ -124,6 +147,11 @@ tle_ctx_create(const struct tle_ctx_param *ctx_prm) if (ctx->prm.hash_alg == TLE_SIPHASH) siphash_initialization(&ctx->prm.secret_key, &ctx->prm.secret_key); + + rte_spinlock_init(&ctx->dev_lock); + rte_spinlock_init(&ctx->bhash_lock[TLE_V4]); + rte_spinlock_init(&ctx->bhash_lock[TLE_V6]); + return ctx; } @@ -137,6 +165,8 @@ tle_ctx_destroy(struct tle_ctx *ctx) return; } + bhash_fini(ctx); + for (i = 0; i != RTE_DIM(ctx->dev); i++) tle_del_dev(ctx->dev + i); @@ -150,37 +180,6 @@ tle_ctx_invalidate(struct tle_ctx *ctx) RTE_SET_USED(ctx); } -static void -fill_pbm(struct tle_pbm *pbm, const struct tle_bl_port *blp) -{ - uint32_t i; - - for (i = 0; i != blp->nb_port; i++) - tle_pbm_set(pbm, blp->port[i]); -} - -static int -init_dev_proto(struct tle_dev *dev, uint32_t idx, int32_t socket_id, - const struct tle_bl_port *blp) -{ - size_t sz; - - sz = sizeof(*dev->dp[idx]); - dev->dp[idx] = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, - socket_id); - - if (dev->dp[idx] == NULL) { - UDP_LOG(ERR, "allocation of %zu bytes on " - "socket %d for %u-th device failed\n", - sz, socket_id, idx); - return ENOMEM; - } - - tle_pbm_init(&dev->dp[idx]->use, LPORT_START_BLK); - fill_pbm(&dev->dp[idx]->use, blp); - return 0; -} - static struct tle_dev * find_free_dev(struct tle_ctx *ctx) { @@ -214,27 +213,8 @@ tle_add_dev(struct tle_ctx *ctx, const struct tle_dev_param *dev_prm) return NULL; rc = 0; - /* device can handle IPv4 traffic */ - if (dev_prm->local_addr4.s_addr != INADDR_ANY) { - rc = init_dev_proto(dev, TLE_V4, ctx->prm.socket_id, - &dev_prm->bl4); - if (rc == 0) - fill_pbm(&ctx->use[TLE_V4], &dev_prm->bl4); - } - - /* device can handle IPv6 traffic */ - if (rc == 0 && memcmp(&dev_prm->local_addr6, &tle_ipv6_any, - sizeof(tle_ipv6_any)) != 0) { - rc = init_dev_proto(dev, TLE_V6, ctx->prm.socket_id, - &dev_prm->bl6); - if (rc == 0) - fill_pbm(&ctx->use[TLE_V6], &dev_prm->bl6); - } - if (rc != 0) { /* cleanup and return an error. */ - rte_free(dev->dp[TLE_V4]); - rte_free(dev->dp[TLE_V6]); rte_errno = rc; return NULL; } @@ -314,220 +294,105 @@ tle_del_dev(struct tle_dev *dev) ctx = dev->ctx; p = dev - ctx->dev; - if (p >= RTE_DIM(ctx->dev) || - (dev->dp[TLE_V4] == NULL && - dev->dp[TLE_V6] == NULL)) + if (p >= RTE_DIM(ctx->dev)) return -EINVAL; /* emtpy TX queues. 
*/ empty_dring(&dev->tx.dr, ctx->prm.proto); - rte_free(dev->dp[TLE_V4]); - rte_free(dev->dp[TLE_V6]); memset(dev, 0, sizeof(*dev)); ctx->nb_dev--; return 0; } -static struct tle_dev * -find_ipv4_dev(struct tle_ctx *ctx, const struct in_addr *addr) -{ - uint32_t i; - - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (ctx->dev[i].prm.local_addr4.s_addr == addr->s_addr && - ctx->dev[i].dp[TLE_V4] != NULL) - return ctx->dev + i; - } - - return NULL; -} - -static struct tle_dev * -find_ipv6_dev(struct tle_ctx *ctx, const struct in6_addr *addr) +int +stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, + const struct sockaddr *laddr, const struct sockaddr *raddr) { - uint32_t i; + struct sockaddr_storage addr; + int32_t rc = 0; - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (memcmp(&ctx->dev[i].prm.local_addr6, addr, - sizeof(*addr)) == 0 && - ctx->dev[i].dp[TLE_V6] != NULL) - return ctx->dev + i; + if (laddr->sa_family == AF_INET) { + s->type = TLE_V4; + } else if (laddr->sa_family == AF_INET6) { + s->type = TLE_V6; } - return NULL; -} - -static int -stream_fill_dev(struct tle_ctx *ctx, struct tle_stream *s, - const struct sockaddr *addr) -{ - struct tle_dev *dev; - struct tle_pbm *pbm; - const struct sockaddr_in *lin4; - const struct sockaddr_in6 *lin6; - uint32_t i, p, sp, t; - - if (addr->sa_family == AF_INET) { - lin4 = (const struct sockaddr_in *)addr; - t = TLE_V4; - p = lin4->sin_port; - } else if (addr->sa_family == AF_INET6) { - lin6 = (const struct sockaddr_in6 *)addr; - t = TLE_V6; - p = lin6->sin6_port; - } else - return EINVAL; - + uint16_t p = ((const struct sockaddr_in *)laddr)->sin_port; p = ntohs(p); - - /* if local address is not wildcard, find device it belongs to. */ - if (t == TLE_V4 && lin4->sin_addr.s_addr != INADDR_ANY) { - dev = find_ipv4_dev(ctx, &lin4->sin_addr); - if (dev == NULL) - return ENODEV; - } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &lin6->sin6_addr, - sizeof(tle_ipv6_any)) != 0) { - dev = find_ipv6_dev(ctx, &lin6->sin6_addr); - if (dev == NULL) - return ENODEV; - } else - dev = NULL; - - if (dev != NULL) - pbm = &dev->dp[t]->use; - else - pbm = &ctx->use[t]; - + struct tle_psm *psm = &ctx->use[s->type]; /* try to acquire local port number. 
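+	 * For an unbound, non-IPV6_V6ONLY v6 socket the port must be
+	 * free in both the v4 and v6 port maps (Linux-like dual-bind
+	 * semantics); tle_psm_alloc_dual_port() below handles that case.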
*/ + rte_spinlock_lock(&ctx->dev_lock); if (p == 0) { - p = tle_pbm_find_range(pbm, pbm->blk, LPORT_END_BLK); - if (p == 0 && pbm->blk > LPORT_START_BLK) - p = tle_pbm_find_range(pbm, LPORT_START_BLK, pbm->blk); - } else if (tle_pbm_check(pbm, p) != 0) - return EEXIST; - - if (p == 0) - return ENFILE; - - /* fill socket's dst port and type */ - - sp = htons(p); - s->type = t; - s->port.dst = sp; - - /* mark port as in-use */ - - tle_pbm_set(&ctx->use[t], p); - if (dev != NULL) { - tle_pbm_set(pbm, p); - dev->dp[t]->streams[sp] = s; - } else { - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (ctx->dev[i].dp[t] != NULL) { - tle_pbm_set(&ctx->dev[i].dp[t]->use, p); - ctx->dev[i].dp[t]->streams[sp] = s; - } + if (s->type == TLE_V6 && is_empty_addr(laddr) && !s->option.ipv6only) + p = tle_psm_alloc_dual_port(&ctx->use[TLE_V4], psm); + else + p = tle_psm_alloc_port(psm); + if (p == 0) { + rte_spinlock_unlock(&ctx->dev_lock); + return ENFILE; } + rte_memcpy(&addr, laddr, sizeof(struct sockaddr_storage)); + ((struct sockaddr_in *)&addr)->sin_port = htons(p); + laddr = (const struct sockaddr*)&addr; } - return 0; -} + if (tle_psm_set(psm, p, s->option.reuseport) != 0) { + rte_spinlock_unlock(&ctx->dev_lock); + return EADDRINUSE; + } -static int -stream_clear_dev(struct tle_ctx *ctx, const struct tle_stream *s) -{ - struct tle_dev *dev; - uint32_t i, p, sp, t; - - t = s->type; - sp = s->port.dst; - p = ntohs(sp); - - /* if local address is not wildcard, find device it belongs to. */ - if (t == TLE_V4 && s->ipv4.addr.dst != INADDR_ANY) { - dev = find_ipv4_dev(ctx, - (const struct in_addr *)&s->ipv4.addr.dst); - if (dev == NULL) - return ENODEV; - } else if (t == TLE_V6 && memcmp(&tle_ipv6_any, &s->ipv6.addr.dst, - sizeof(tle_ipv6_any)) != 0) { - dev = find_ipv6_dev(ctx, - (const struct in6_addr *)&s->ipv6.addr.dst); - if (dev == NULL) - return ENODEV; - } else - dev = NULL; - - tle_pbm_clear(&ctx->use[t], p); - if (dev != NULL) { - if (dev->dp[t]->streams[sp] == s) { - tle_pbm_clear(&dev->dp[t]->use, p); - dev->dp[t]->streams[sp] = NULL; - } - } else { - for (i = 0; i != RTE_DIM(ctx->dev); i++) { - if (ctx->dev[i].dp[t] != NULL && - ctx->dev[i].dp[t]->streams[sp] == s) { - tle_pbm_clear(&ctx->dev[i].dp[t]->use, p); - ctx->dev[i].dp[t]->streams[sp] = NULL; + /* strange behaviour to match linux stack */ + if (is_empty_addr(laddr)) { + if (s->type == TLE_V4) { + if (tle_psm_check(&ctx->use[TLE_V6], p) != 0) { + tle_psm_clear(psm, p); + rte_spinlock_unlock(&ctx->dev_lock); + return EADDRINUSE; + } + } else { + if (!s->option.ipv6only) + rc = tle_psm_set(&ctx->use[TLE_V4], p, s->option.reuseport); + if (rc != 0) { + tle_psm_clear(psm, p); + rte_spinlock_unlock(&ctx->dev_lock); + return EADDRINUSE; } } } - return 0; -} - -static void -fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask) -{ - *addr = in->sin_addr.s_addr; - *mask = (*addr == INADDR_ANY) ? 
INADDR_ANY : INADDR_NONE; -} - -static void -fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask) -{ - const struct in6_addr *pm; + if (is_empty_addr(raddr)) + rc = bhash_add_entry(ctx, laddr, s); - memcpy(addr, &in->sin6_addr, sizeof(*addr)); - if (memcmp(&tle_ipv6_any, addr, sizeof(*addr)) == 0) - pm = &tle_ipv6_any; - else - pm = &tle_ipv6_none; - - memcpy(mask, pm, sizeof(*mask)); -} + if (rc) { + tle_psm_clear(psm, p); + } -int -stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, - const struct sockaddr *laddr, const struct sockaddr *raddr) -{ - const struct sockaddr_in *rin; - int32_t rc; + rte_spinlock_unlock(&ctx->dev_lock); + /* fill socket's dst (src actually) port */ + s->port.dst = htons(p); - /* setup ports and port mask fields (except dst port). */ - rin = (const struct sockaddr_in *)raddr; - s->port.src = rin->sin_port; - s->pmsk.src = (s->port.src == 0) ? 0 : UINT16_MAX; - s->pmsk.dst = UINT16_MAX; + if (rc) + return rc; - /* setup src and dst addresses. */ + /* setup src, dst addresses, and src port. */ if (laddr->sa_family == AF_INET) { fill_ipv4_am((const struct sockaddr_in *)laddr, &s->ipv4.addr.dst, &s->ipv4.mask.dst); fill_ipv4_am((const struct sockaddr_in *)raddr, &s->ipv4.addr.src, &s->ipv4.mask.src); + s->port.src = ((const struct sockaddr_in *)raddr)->sin_port; } else if (laddr->sa_family == AF_INET6) { fill_ipv6_am((const struct sockaddr_in6 *)laddr, &s->ipv6.addr.dst, &s->ipv6.mask.dst); fill_ipv6_am((const struct sockaddr_in6 *)raddr, &s->ipv6.addr.src, &s->ipv6.mask.src); + s->port.src = ((const struct sockaddr_in6 *)raddr)->sin6_port; } - rte_spinlock_lock(&ctx->dev_lock); - rc = stream_fill_dev(ctx, s, laddr); - rte_spinlock_unlock(&ctx->dev_lock); + /* setup port mask fields. */ + s->pmsk.src = (s->port.src == 0) ? 0 : UINT16_MAX; + s->pmsk.dst = UINT16_MAX; return rc; } @@ -536,11 +401,41 @@ stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, int stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s) { - int32_t rc; + bool is_any = false; + struct sockaddr_storage addr; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + if (s->type == TLE_V4) { + if (s->ipv4.addr.src == INADDR_ANY) { + is_any = true; + addr4 = (struct sockaddr_in *)&addr; + addr4->sin_addr.s_addr = s->ipv4.addr.dst; + addr4->sin_port = s->port.dst; + addr.ss_family = AF_INET; + bhash_del_entry(ctx, s, (struct sockaddr*)&addr); + } + } else { + if (IN6_IS_ADDR_UNSPECIFIED(&s->ipv6.addr.src)) { + is_any = true; + addr6 = (struct sockaddr_in6 *)&addr; + memcpy(&addr6->sin6_addr, &s->ipv6.addr.dst, + sizeof(tle_ipv6_any)); + addr6->sin6_port = s->port.dst; + addr.ss_family = AF_INET6; + bhash_del_entry(ctx, s, (struct sockaddr*)&addr); + } + } rte_spinlock_lock(&ctx->dev_lock); - rc = stream_clear_dev(ctx, s); + /* strange behaviour to match linux stack */ + if (is_any) { + if (s->type == TLE_V6 && !s->option.ipv6only) + tle_psm_clear(&ctx->use[TLE_V4], ntohs(s->port.dst)); + } + + tle_psm_clear(&ctx->use[s->type], ntohs(s->port.dst)); rte_spinlock_unlock(&ctx->dev_lock); - return rc; + return 0; } diff --git a/lib/libtle_l4p/ctx.h b/lib/libtle_l4p/ctx.h index 389d646..5cb5df2 100644 --- a/lib/libtle_l4p/ctx.h +++ b/lib/libtle_l4p/ctx.h @@ -21,7 +21,7 @@ #include #include -#include "port_bitmap.h" +#include "port_statmap.h" #include "osdep.h" #include "net_misc.h" @@ -29,11 +29,6 @@ extern "C" { #endif -struct tle_dport { - struct tle_pbm use; /* ports in use. */ - struct tle_stream *streams[MAX_PORT_NUM]; /* port to stream. 
*/ -}; - struct tle_dev { struct tle_ctx *ctx; struct { @@ -48,7 +43,6 @@ struct tle_dev { struct tle_dring dr; } tx; struct tle_dev_param prm; /* copy of device parameters. */ - struct tle_dport *dp[TLE_VNUM]; /* device L4 ports */ }; struct tle_ctx { @@ -57,18 +51,23 @@ struct tle_ctx { struct { rte_spinlock_t lock; uint32_t nb_free; /* number of free streams. */ + uint32_t nb_cur; /* number of allocated streams. */ STAILQ_HEAD(, tle_stream) free; void *buf; /* space allocated for streams */ } streams; - rte_spinlock_t dev_lock; + rte_spinlock_t bhash_lock[TLE_VNUM]; + struct rte_hash *bhash[TLE_VNUM]; /* bind and listen hash table */ + uint32_t nb_dev; - struct tle_pbm use[TLE_VNUM]; /* all ports in use. */ + rte_spinlock_t dev_lock; + struct tle_psm use[TLE_VNUM]; /* all ports in use. */ struct tle_dev dev[RTE_MAX_ETHPORTS]; }; struct stream_ops { int (*init_streams)(struct tle_ctx *); + uint32_t (*more_streams)(struct tle_ctx *); void (*fini_streams)(struct tle_ctx *); void (*free_drbs)(struct tle_stream *, struct tle_drb *[], uint32_t); }; @@ -80,6 +79,27 @@ int stream_fill_ctx(struct tle_ctx *ctx, struct tle_stream *s, int stream_clear_ctx(struct tle_ctx *ctx, struct tle_stream *s); +static inline void +fill_ipv4_am(const struct sockaddr_in *in, uint32_t *addr, uint32_t *mask) +{ + *addr = in->sin_addr.s_addr; + *mask = (*addr == INADDR_ANY) ? INADDR_ANY : INADDR_NONE; +} + +static inline void +fill_ipv6_am(const struct sockaddr_in6 *in, rte_xmm_t *addr, rte_xmm_t *mask) +{ + const struct in6_addr *pm; + + memcpy(addr, &in->sin6_addr, sizeof(*addr)); + if (IN6_IS_ADDR_UNSPECIFIED(addr)) + pm = &tle_ipv6_any; + else + pm = &tle_ipv6_none; + + memcpy(mask, pm, sizeof(*mask)); +} + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/event.c b/lib/libtle_l4p/event.c index 66c5a3b..809de31 100644 --- a/lib/libtle_l4p/event.c +++ b/lib/libtle_l4p/event.c @@ -25,16 +25,14 @@ tle_evq_create(const struct tle_evq_param *prm) { struct tle_evq *evq; size_t sz; - uint32_t i; if (prm == NULL) { rte_errno = EINVAL; return NULL; } - sz = sizeof(*evq) + sizeof(evq->events[0]) * prm->max_events; - evq = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, - prm->socket_id); + sz = sizeof(*evq); + evq = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, prm->socket_id); if (evq == NULL) { UDP_LOG(ERR, "allocation of %zu bytes for " "new tle_evq(%u) on socket %d failed\n", @@ -43,16 +41,6 @@ tle_evq_create(const struct tle_evq_param *prm) } TAILQ_INIT(&evq->armed); - TAILQ_INIT(&evq->free); - - for (i = 0; i != prm->max_events; i++) { - evq->events[i].head = evq; - TAILQ_INSERT_TAIL(&evq->free, evq->events + i, ql); - } - - evq->nb_events = i; - evq->nb_free = i; - return evq; } @@ -65,40 +53,32 @@ tle_evq_destroy(struct tle_evq *evq) struct tle_event * tle_event_alloc(struct tle_evq *evq, const void *data) { - struct tle_event *h; + struct tle_event *h = NULL; if (evq == NULL) { rte_errno = EINVAL; return NULL; } - rte_spinlock_lock(&evq->lock); - h = TAILQ_FIRST(&evq->free); - if (h != NULL) { - TAILQ_REMOVE(&evq->free, h, ql); - evq->nb_free--; - h->data = data; - } else + h = rte_malloc(NULL, sizeof(*h), sizeof(h)); + if (h == NULL) { rte_errno = ENOMEM; - rte_spinlock_unlock(&evq->lock); + } else { + h->head = evq; + h->data = data; + } return h; } void tle_event_free(struct tle_event *ev) { - struct tle_evq *q; - if (ev == NULL) { rte_errno = EINVAL; return; } - q = ev->head; - rte_spinlock_lock(&q->lock); ev->data = NULL; ev->state = TLE_SEV_IDLE; - TAILQ_INSERT_HEAD(&q->free, ev, ql); - 
q->nb_free++; - rte_spinlock_unlock(&q->lock); + rte_free(ev); } diff --git a/lib/libtle_l4p/misc.h b/lib/libtle_l4p/misc.h index 9bff459..ce26255 100644 --- a/lib/libtle_l4p/misc.h +++ b/lib/libtle_l4p/misc.h @@ -16,12 +16,34 @@ #ifndef _MISC_H_ #define _MISC_H_ +#include #include #ifdef __cplusplus extern "C" { #endif +union typflg { + uint16_t raw; + struct { + uint8_t type; /* TLE_V4/TLE_V6 */ + uint8_t flags; /* TCP header flags */ + }; +}; + +union pkt_info { + rte_xmm_t raw; + struct { + union typflg tf; + uint16_t csf; /* checksum flags */ + union l4_ports port; + union { + union ipv4_addrs addr4; + const union ipv6_addrs *addr6; + }; + }; +}; + static inline int xmm_cmp(const rte_xmm_t *da, const rte_xmm_t *sa) { @@ -287,25 +309,40 @@ _ipv4x_cksum(const void *iph, size_t len) } static inline int -check_pkt_csum(const struct rte_mbuf *m, uint64_t ol_flags, uint32_t type, - uint32_t proto) +check_pkt_csum(const struct rte_mbuf *m, uint32_t type, uint32_t proto) { const struct ipv4_hdr *l3h4; const struct ipv6_hdr *l3h6; const struct udp_hdr *l4h; int32_t ret; uint16_t csum; + uint64_t ol_flags = m->ol_flags; + + /* case 0: both ip and l4 cksum is verified or data is valid */ + if ((ol_flags & PKT_RX_IP_CKSUM_GOOD) && + (ol_flags & PKT_RX_L4_CKSUM_GOOD)) + return 0; + + /* case 1: either ip or l4 cksum bad */ + if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_BAD) + return 1; + if ((ol_flags & PKT_RX_L4_CKSUM_MASK) == PKT_RX_L4_CKSUM_BAD) + return 1; + + /* case 2: either ip or l4 or both cksum is unknown */ ret = 0; l3h4 = rte_pktmbuf_mtod_offset(m, const struct ipv4_hdr *, m->l2_len); l3h6 = rte_pktmbuf_mtod_offset(m, const struct ipv6_hdr *, m->l2_len); - if ((ol_flags & PKT_RX_IP_CKSUM_BAD) != 0) { + if ((ol_flags & PKT_RX_IP_CKSUM_MASK) == PKT_RX_IP_CKSUM_UNKNOWN && + l3h4->hdr_checksum != 0) { csum = _ipv4x_cksum(l3h4, m->l3_len); ret = (csum != UINT16_MAX); } - if (ret == 0 && (ol_flags & PKT_RX_L4_CKSUM_BAD) != 0) { + if (ret == 0 && (ol_flags & PKT_RX_L4_CKSUM_MASK) == + PKT_RX_L4_CKSUM_UNKNOWN) { /* * for IPv4 it is allowed to have zero UDP cksum, @@ -333,6 +370,12 @@ check_pkt_csum(const struct rte_mbuf *m, uint64_t ol_flags, uint32_t type, * Consider to move into DPDK librte_eal. */ +static inline int +rwl_is_up(rte_atomic32_t *p) +{ + return (rte_atomic32_read(p) >= 0); +} + static inline int rwl_try_acquire(rte_atomic32_t *p) { @@ -359,7 +402,8 @@ rwl_acquire(rte_atomic32_t *p) static inline void rwl_down(rte_atomic32_t *p) { - while (rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0) + while (rte_atomic32_read(p) != INT32_MIN && + rte_atomic32_cmpset((volatile uint32_t *)p, 0, INT32_MIN) == 0) rte_pause(); } @@ -493,6 +537,122 @@ _iovec_to_mbsegs(struct iovec *iv, uint32_t seglen, struct rte_mbuf *mb[], return i; } +static inline void +set_msg_timestamp(struct msghdr *msg, struct rte_mbuf* m) +{ + struct cmsghdr* cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SO_TIMESTAMP; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct timeval)); + msg->msg_controllen = cmsg->cmsg_len; + struct timeval *tv = (struct timeval*)CMSG_DATA(cmsg); + tv->tv_sec = m->timestamp >> 20; + tv->tv_usec = m->timestamp & 0xFFFFFUL; +} + +/** + * Remove len bytes at the beginning of an mbuf. + * + * It's an enhancement version of rte_pktmbuf_abj which not support + * adjusting length greater than the length of the first segment. + * + * Returns a pointer to the new mbuf. 
If the + * length is greater than the total length of the mbuf, then the + * function will fail and return NULL, without modifying the mbuf. + + * @param m + * The packet mbuf. + * @param len + * The amount of data to remove (in bytes). + * @return + * A pointer to the new start of the data. + */ +static inline struct rte_mbuf * +_rte_pktmbuf_adj(struct rte_mbuf *m, uint16_t len) +{ + struct rte_mbuf *next; + uint32_t plen = m->pkt_len; + uint16_t segs = m->nb_segs; + + if (unlikely(len > plen)) + return NULL; + + while (len > m->data_len) { + next = m->next; + plen -= m->data_len; + len -= m->data_len; + segs--; + rte_pktmbuf_free_seg(m); + m = next; + } + + if (len) { + m->data_len = (uint16_t)(m->data_len - len); + m->data_off = (uint16_t)(m->data_off + len); + plen -= len; + } + + m->pkt_len = plen; + m->nb_segs = segs; + return m; +} + +/** + * Remove len bytes of data at the end of the mbuf. + * + * It's an enhancement version of rte_pktmbuf_trim, which not support + * removing length greater than the length of the last segment. + * + * @param m + * The packet mbuf. + * @param len + * The amount of data to remove (in bytes). + * @return + * - 0: On success. + * - -1: On error. + */ +static inline int +_rte_pktmbuf_trim(struct rte_mbuf *m, uint16_t len) +{ + struct rte_mbuf *next, *tmp, *last; + uint32_t plen = m->pkt_len; + uint32_t left; + uint16_t segs = m->nb_segs; + + if (unlikely(len > plen)) + return -1; + + left = m->pkt_len - m->data_len; + next = m->next; + last = m; + /* find the last segment will remain after trim */ + while (left > len) { + left -= next->data_len; + if (left <= len) { + last = next; + } + next = next->next; + } + if (left > 0) { + /* remove last segments */ + last->next = NULL; + while (next != NULL) { + tmp = next->next; + segs--; + rte_pktmbuf_free_seg(next); + next = tmp; + } + m->pkt_len -= left; + m->nb_segs = segs; + len -= left; + } + + /* trim the remained last segment */ + last->data_len = (uint16_t)(last->data_len - len); + m->pkt_len = (m->pkt_len - len); + return 0; +} + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/net_misc.h b/lib/libtle_l4p/net_misc.h index 2d8dac2..e270e70 100644 --- a/lib/libtle_l4p/net_misc.h +++ b/lib/libtle_l4p/net_misc.h @@ -16,6 +16,7 @@ #ifndef _NET_MISC_H_ #define _NET_MISC_H_ +#include #include #include #include "osdep.h" @@ -39,8 +40,12 @@ enum { TLE_VNUM }; +#define IPV6_MULTI_MASK_LEN 13 extern const struct in6_addr tle_ipv6_any; extern const struct in6_addr tle_ipv6_none; +extern const struct in6_addr tle_ipv6_loopback; +extern const struct in6_addr tle_ipv6_all_multi; +extern const struct in6_addr tle_ipv6_multi_mask; union l4_ports { uint32_t raw; @@ -71,6 +76,26 @@ union ip_addrs { union ipv6_addrs v6; }; +static inline bool +is_empty_addr(const struct sockaddr *addr) +{ + bool any = false; + const struct sockaddr_in *in4; + const struct sockaddr_in6 *in6; + + if (addr->sa_family == AF_INET) { + in4 = (const struct sockaddr_in *)addr; + if (in4->sin_addr.s_addr == INADDR_ANY) + any = true; + } else if (addr->sa_family == AF_INET6) { + in6 = (const struct sockaddr_in6 *)addr; + if (IN6_IS_ADDR_UNSPECIFIED(&in6->sin6_addr)) + any = true; + } + + return any; +} + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/port_bitmap.h b/lib/libtle_l4p/port_bitmap.h index c0420d5..20a942a 100644 --- a/lib/libtle_l4p/port_bitmap.h +++ b/lib/libtle_l4p/port_bitmap.h @@ -98,7 +98,7 @@ tle_pbm_find_range(struct tle_pbm *pbm, uint32_t start_blk, uint32_t end_blk) (v & 1) != 0; v >>= 1, p++) ; - pbm->blk = i; 
+ pbm->blk = i + 1; break; } } diff --git a/lib/libtle_l4p/port_statmap.h b/lib/libtle_l4p/port_statmap.h new file mode 100644 index 0000000..b5ef253 --- /dev/null +++ b/lib/libtle_l4p/port_statmap.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2019 Ant Financial Services Group. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _PORT_STATMAP_H_ +#define _PORT_STATMAP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_PORT_NUM (UINT16_MAX + 1) +#define ALLOC_PORT_START 0x8000 + +struct tle_psm { + uint32_t nb_used; /* number of ports already in use. */ + uint32_t next_alloc; /* next port to try allocate. */ + uint8_t stat[MAX_PORT_NUM]; /* use status of each port. first bit + shows if SO_REUSEPORT is allowed, last + 7 bits record the count of sockets who + use this port. */ +}; + +static inline void +tle_psm_init(struct tle_psm *psm) +{ + memset(psm, 0, sizeof(struct tle_psm)); + psm->next_alloc = ALLOC_PORT_START; +} + +static inline int +tle_psm_set(struct tle_psm *psm, uint16_t port, uint8_t reuseport) +{ + if (psm->stat[port] == 0) { + /* port has not been used */ + psm->stat[port]++; + if (reuseport) + psm->stat[port] |= 0x80; + } else { + /* port is used by some socket */ + if (reuseport && (psm->stat[port] & 0x80)) { + /* all sockets set reuseport */ + psm->stat[port]++; + } + else{ + return -1; + } + } + + return 0; +} + +static inline void +tle_psm_clear(struct tle_psm *psm, uint16_t port) +{ + psm->stat[port]--; + if ((psm->stat[port] & 0x7f) == 0) + psm->stat[port] = 0; +} + + +static inline uint8_t +tle_psm_check(const struct tle_psm *psm, uint16_t port) +{ + return psm->stat[port]; +} + +static inline uint16_t +tle_psm_alloc_port(struct tle_psm *psm) +{ + uint32_t i = psm->next_alloc; + + for (; i < MAX_PORT_NUM; i++) { + if (psm->stat[i] == 0) { + psm->next_alloc = i + 1; + return (uint16_t)i; + } + } + + for (i = ALLOC_PORT_START; i < psm->next_alloc; i++) { + if (psm->stat[i] == 0) { + psm->next_alloc = i + 1; + return (uint16_t)i; + } + } + + return 0; +} + +static inline uint16_t +tle_psm_alloc_dual_port(struct tle_psm *psm4, struct tle_psm *psm6) +{ + uint32_t i = psm6->next_alloc; + + for (; i < MAX_PORT_NUM; i++) { + if (psm6->stat[i] == 0 && psm4->stat[i] == 0) { + psm6->next_alloc = i + 1; + return (uint16_t)i; + } + } + + for (i = ALLOC_PORT_START; i < psm6->next_alloc; i++) { + if (psm6->stat[i] == 0 && psm4->stat[i] == 0) { + psm6->next_alloc = i + 1; + return (uint16_t)i; + } + } + + return 0; +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _PORT_STATMAP_H_ */ diff --git a/lib/libtle_l4p/stream.h b/lib/libtle_l4p/stream.h index e76f126..5477c28 100644 --- a/lib/libtle_l4p/stream.h +++ b/lib/libtle_l4p/stream.h @@ -22,6 +22,25 @@ extern "C" { #endif +union stream_option { + struct { + uint32_t reuseaddr: 1; + uint32_t reuseport: 1; + uint32_t keepalive: 1; + uint32_t ipv6only: 1; + uint32_t oobinline: 1; + uint32_t tcpcork: 1; + uint32_t tcpquickack: 4; + uint32_t tcpnodelay: 1; + uint32_t mulloop: 1; + uint32_t reserve: 12; + uint32_t 
multtl: 8; + uint16_t keepidle; + uint16_t keepintvl; + }; + uint64_t raw; +}; + /* * Common structure that must be present as first field in all partcular * L4 (UDP/TCP, etc.) stream implementations. @@ -32,6 +51,10 @@ struct tle_stream { struct tle_ctx *ctx; uint8_t type; /* TLE_V4 or TLE_V6 */ + uint8_t padding; + uint16_t reuseport_seed; + union stream_option option; + unsigned long timestamp; /* Stream address information. */ union l4_ports port; @@ -53,15 +76,25 @@ static inline uint32_t get_streams(struct tle_ctx *ctx, struct tle_stream *s[], uint32_t num) { struct tle_stream *p; - uint32_t i, n; + uint32_t i, n, inc; rte_spinlock_lock(&ctx->streams.lock); - n = RTE_MIN(ctx->streams.nb_free, num); - for (i = 0, p = STAILQ_FIRST(&ctx->streams.free); - i != n; - i++, p = STAILQ_NEXT(p, link)) + n = ctx->streams.nb_free; + if (n < num) { + inc = tle_stream_ops[ctx->prm.proto].more_streams(ctx); + ctx->streams.nb_free += inc; + ctx->streams.nb_cur += inc; + n = ctx->streams.nb_free; + } + n = RTE_MIN(n, num); + + for (i = 0, p = STAILQ_FIRST(&ctx->streams.free); i != n; ) { s[i] = p; + p = STAILQ_NEXT(p, link); + s[i]->link.stqe_next = NULL; + i++; + } if (p == NULL) /* we retrieved all free entries */ @@ -80,9 +113,6 @@ get_stream(struct tle_ctx *ctx) struct tle_stream *s; s = NULL; - if (ctx->streams.nb_free == 0) - return s; - get_streams(ctx, &s, 1); return s; } @@ -120,8 +150,8 @@ drb_nb_elem(const struct tle_ctx *ctx) } static inline int32_t -stream_get_dest(struct tle_stream *s, const void *dst_addr, - struct tle_dest *dst) +stream_get_dest(uint8_t type, struct tle_stream *s, const void *src_addr, + const void *dst_addr, struct tle_dest *dst) { int32_t rc; const struct in_addr *d4; @@ -133,11 +163,13 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr, /* it is here just to keep gcc happy. */ d4 = NULL; + /* it is here just to keep gcc happy. */ + d6 = NULL; - if (s->type == TLE_V4) { + if (type == TLE_V4) { d4 = dst_addr; rc = ctx->prm.lookup4(ctx->prm.lookup4_data, d4, dst); - } else if (s->type == TLE_V6) { + } else if (type == TLE_V6) { d6 = dst_addr; rc = ctx->prm.lookup6(ctx->prm.lookup6_data, d6, dst); } else @@ -147,18 +179,25 @@ stream_get_dest(struct tle_stream *s, const void *dst_addr, return -ENOENT; dev = dst->dev; - dst->ol_flags = dev->tx.ol_flags[s->type]; + dst->ol_flags = dev->tx.ol_flags[type]; - if (s->type == TLE_V4) { + if (type == TLE_V4) { struct ipv4_hdr *l3h; l3h = (struct ipv4_hdr *)(dst->hdr + dst->l2_len); - l3h->src_addr = dev->prm.local_addr4.s_addr; + if (((const struct in_addr*)src_addr)->s_addr != INADDR_ANY) + l3h->src_addr = ((const struct in_addr*)src_addr)->s_addr; + else + l3h->src_addr = dev->prm.local_addr4.s_addr; l3h->dst_addr = d4->s_addr; } else { struct ipv6_hdr *l3h; l3h = (struct ipv6_hdr *)(dst->hdr + dst->l2_len); - rte_memcpy(l3h->src_addr, &dev->prm.local_addr6, - sizeof(l3h->src_addr)); + if (!IN6_IS_ADDR_UNSPECIFIED(src_addr)) + rte_memcpy(l3h->src_addr, src_addr, + sizeof(l3h->src_addr)); + else + rte_memcpy(l3h->src_addr, &dev->prm.local_addr6, + sizeof(l3h->src_addr)); rte_memcpy(l3h->dst_addr, d6, sizeof(l3h->dst_addr)); } diff --git a/lib/libtle_l4p/stream_table.c b/lib/libtle_l4p/stream_table.c index 5a89553..e029306 100644 --- a/lib/libtle_l4p/stream_table.c +++ b/lib/libtle_l4p/stream_table.c @@ -13,68 +13,47 @@ * limitations under the License. 
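
Note that get_streams() above no longer fails when the preallocated pool is empty: it first asks the protocol's more_streams() hook for another batch and accounts for it in nb_free/nb_cur. A sketch of that grow-on-demand pattern (the batch size of 64 is invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t nb_free, nb_cur;

    static uint32_t more_streams(void)
    {
        return 64;                 /* pretend we carved out 64 new streams */
    }

    static uint32_t get_streams(uint32_t num)
    {
        uint32_t n = nb_free;

        if (n < num) {             /* grow the pool before giving up */
            uint32_t inc = more_streams();
            nb_free += inc;
            nb_cur += inc;
            n = nb_free;
        }
        n = (n < num) ? n : num;   /* RTE_MIN(n, num) */
        nb_free -= n;
        return n;
    }

    int main(void)
    {
        printf("got %u\n", get_streams(10));               /* got 10 */
        printf("%u free of %u total\n", nb_free, nb_cur);  /* 54 of 64 */
        return 0;
    }
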
*/ #include -#include #include #include "stream_table.h" void -stbl_fini(struct stbl *st) +bhash_fini(struct tle_ctx *ctx) { uint32_t i; - for (i = 0; i != RTE_DIM(st->ht); i++) { - rte_hash_free(st->ht[i].t); - rte_free(st->ht[i].ent); - } - - memset(st, 0, sizeof(*st)); + for (i = 0; i != RTE_DIM(ctx->bhash); i++) + rte_hash_free(ctx->bhash[i]); } int -stbl_init(struct stbl *st, uint32_t num, int32_t socket) +bhash_init(struct tle_ctx *ctx) { - int32_t rc; - size_t i, sz; - struct rte_hash_parameters hprm; + int rc = 0; + struct rte_hash_parameters hprm = {0}; + bool ipv6 = ctx->prm.lookup6 != NULL; char buf[RTE_HASH_NAMESIZE]; - num = RTE_MAX(5 * num / 4, 0x10U); - - memset(&hprm, 0, sizeof(hprm)); hprm.name = buf; - hprm.entries = num; - hprm.socket_id = socket; - - rc = 0; - - snprintf(buf, sizeof(buf), "stbl4@%p", st); - hprm.key_len = sizeof(struct stbl4_key); - st->ht[TLE_V4].t = rte_hash_create(&hprm); - if (st->ht[TLE_V4].t == NULL) + hprm.entries = 4096; + hprm.extra_flag = RTE_HASH_EXTRA_FLAGS_EXT_TABLE; + hprm.socket_id = ctx->prm.socket_id; + + snprintf(buf, sizeof(buf), "bhash4@%p", ctx); + hprm.key_len = sizeof(struct bhash4_key); + ctx->bhash[TLE_V4] = rte_hash_create(&hprm); + if (ctx->bhash[TLE_V4] == NULL) rc = (rte_errno != 0) ? -rte_errno : -ENOMEM; - if (rc == 0) { - snprintf(buf, sizeof(buf), "stbl6@%p", st); - hprm.key_len = sizeof(struct stbl6_key); - st->ht[TLE_V6].t = rte_hash_create(&hprm); - if (st->ht[TLE_V6].t == NULL) + if (rc == 0 && ipv6) { + snprintf(buf, sizeof(buf), "bhash6@%p", ctx); + hprm.key_len = sizeof(struct bhash6_key); + ctx->bhash[TLE_V6] = rte_hash_create(&hprm); + if (ctx->bhash[TLE_V6] == NULL) { + rte_hash_free(ctx->bhash[TLE_V4]); rc = (rte_errno != 0) ? -rte_errno : -ENOMEM; + } } - for (i = 0; i != RTE_DIM(st->ht) && rc == 0; i++) { - - sz = sizeof(*st->ht[i].ent) * num; - st->ht[i].ent = rte_zmalloc_socket(NULL, sz, - RTE_CACHE_LINE_SIZE, socket); - if (st->ht[i].ent == NULL) - rc = -ENOMEM; - else - st->ht[i].nb_ent = num; - } - - if (rc != 0) - stbl_fini(st); - return rc; } diff --git a/lib/libtle_l4p/stream_table.h b/lib/libtle_l4p/stream_table.h index 033c306..16398cc 100644 --- a/lib/libtle_l4p/stream_table.h +++ b/lib/libtle_l4p/stream_table.h @@ -16,199 +16,407 @@ #ifndef _STREAM_TABLE_H_ #define _STREAM_TABLE_H_ +#include #include -#include "tcp_misc.h" +#include "stream.h" +#include "misc.h" #ifdef __cplusplus extern "C" { #endif +#define HASH_SIZE_32K 32771 +#define HASH_SIZE_64K 65537 +#define HASH_SIZE_128K 131071 + +#define HASH_SIZE HASH_SIZE_64K + struct stbl_entry { void *data; }; -struct shtbl { - uint32_t nb_ent; /* max number of entries in the table. 
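
The new bind hash keeps one entry per (address, port) pair, whose value is the head of a list of bound streams. Lookups for an incoming packet first try the exact destination address, then retry with the wildcard address; that is the fallback order the bhash_lookup4()/bhash_lookup6() helpers later in this file implement on top of rte_hash. A trivial stand-in showing only that order (table contents invented):

    #include <stdint.h>
    #include <stdio.h>

    struct entry { uint32_t addr; uint16_t port; const char *stream; };

    static const struct entry tbl[] = {
        { 0x7f000001, 80, "listener on 127.0.0.1:80" },
        { 0x00000000, 80, "listener on *:80" },
    };

    static const char *lookup4(uint32_t addr, uint16_t port)
    {
        for (unsigned i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
            if (tbl[i].addr == addr && tbl[i].port == port)
                return tbl[i].stream;
        if (addr != 0)                  /* retry with INADDR_ANY */
            return lookup4(0, port);
        return NULL;
    }

    int main(void)
    {
        puts(lookup4(0x7f000001, 80));  /* exact match wins */
        puts(lookup4(0x0a000001, 80));  /* falls back to the wildcard */
        return 0;
    }
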
*/ - rte_spinlock_t l; /* lock to protect the hash table */ - struct rte_hash *t; - struct stbl_entry *ent; +struct stbl { + rte_spinlock_t l; + uint32_t need_lock; + struct stbl_entry head[HASH_SIZE]; } __rte_cache_aligned; -struct stbl { - struct shtbl ht[TLE_VNUM]; -}; +static inline int +stbl_init(struct stbl *st, uint32_t lock) +{ + st->need_lock = lock; + return 0; +} -struct stbl4_key { - union l4_ports port; - union ipv4_addrs addr; -} __attribute__((__packed__)); +static inline int +stbl_fini(struct stbl *st) +{ + st->need_lock = 0; + return 0; +} -struct stbl6_key { - union l4_ports port; - union ipv6_addrs addr; -} __attribute__((__packed__)); +static inline uint8_t +compare_pkt(const struct tle_stream *s, const union pkt_info *pi) +{ + if (s->type != pi->tf.type) + return -1; -struct stbl_key { - union l4_ports port; - union { - union ipv4_addrs addr4; - union ipv6_addrs addr6; - }; -} __attribute__((__packed__)); + if (s->port.raw != pi->port.raw) + return -1; -extern void stbl_fini(struct stbl *st); + if (s->type == TLE_V4) { + if (s->ipv4.addr.raw != pi->addr4.raw) + return -1; + } else { + if (memcmp(&s->ipv6.addr, pi->addr6, sizeof(union ipv6_addrs))) + return -1; + } -extern int stbl_init(struct stbl *st, uint32_t num, int32_t socket); + return 0; +} -static inline void -stbl_pkt_fill_key(struct stbl_key *k, const union pkt_info *pi, uint32_t type) +static inline uint32_t +stbl_hash_stream(const struct tle_stream *s) { - static const struct stbl_key zero = { - .port.raw = 0, - }; - - k->port = pi->port; - if (type == TLE_V4) - k->addr4 = pi->addr4; - else if (type == TLE_V6) - k->addr6 = *pi->addr6; - else - *k = zero; + int i; + unsigned int hash; + + if (s->type == TLE_V4) { + hash = s->ipv4.addr.src ^ s->ipv4.addr.dst + ^ s->port.src ^ s->port.dst; + } else { + hash = s->port.src ^ s->port.dst; + for (i = 0; i < 4; i++) { + hash ^= s->ipv6.addr.src.u32[i]; + hash ^= s->ipv6.addr.dst.u32[i]; + } + } + + return hash % HASH_SIZE; } -static inline void -stbl_lock(struct stbl *st, uint32_t type) +static inline uint32_t +stbl_hash_pkt(const union pkt_info* pi) { - rte_spinlock_lock(&st->ht[type].l); + unsigned int hash; + if (pi->tf.type == TLE_V4) { + hash = pi->addr4.src ^ pi->addr4.dst ^ pi->port.src ^ pi->port.dst; + } else { + hash = pi->port.src ^ pi->port.dst; + for (int i = 0; i < 4; i++) { + hash ^= pi->addr6->src.u32[i]; + hash ^= pi->addr6->dst.u32[i]; + } + } + + return hash % HASH_SIZE; } -static inline void -stbl_unlock(struct stbl *st, uint32_t type) +static inline struct stbl_entry* +stbl_add_stream(struct stbl *st, struct tle_stream *s) { - rte_spinlock_unlock(&st->ht[type].l); + struct stbl_entry* entry; + + if (st->need_lock) + rte_spinlock_lock(&st->l); + entry = &st->head[stbl_hash_stream(s)]; + s->link.stqe_next = (struct tle_stream*)entry->data; + entry->data = s; + if (st->need_lock) + rte_spinlock_unlock(&st->l); + + return entry; } -static inline struct stbl_entry * -stbl_add_entry(struct stbl *st, const union pkt_info *pi) +static inline struct tle_stream * +stbl_find_stream(struct stbl *st, const union pkt_info *pi) { - int32_t rc; - uint32_t type; - struct shtbl *ht; - struct stbl_key k; - - type = pi->tf.type; - stbl_pkt_fill_key(&k, pi, type); - ht = st->ht + type; - - rc = rte_hash_add_key(ht->t, &k); - if ((uint32_t)rc >= ht->nb_ent) - return NULL; - return ht->ent + rc; + struct tle_stream* head; + + if (st->need_lock) + rte_spinlock_lock(&st->l); + head = (struct tle_stream*)st->head[stbl_hash_pkt(pi)].data; + while (head != NULL) { + if 
(compare_pkt(head, pi) == 0) + break; + + head = head->link.stqe_next; + } + if (st->need_lock) + rte_spinlock_unlock(&st->l); + return head; } -static inline struct stbl_entry * -stbl_add_stream(struct stbl *st, const union pkt_info *pi, const void *s) +static inline void +stbl_del_stream(struct stbl *st, struct stbl_entry *se, + struct tle_stream *s) { - struct stbl_entry *se; + struct tle_stream *prev, *current; - se = stbl_add_entry(st, pi); - if (se != NULL) - se->data = (void *)(uintptr_t)s; - return se; + if (st->need_lock) + rte_spinlock_lock(&st->l); + if (se == NULL) + se = &st->head[stbl_hash_stream(s)]; + prev = NULL; + current = (struct tle_stream*)se->data; + while (current != NULL) { + if (current != s) { + prev = current; + current = current->link.stqe_next; + continue; + } + + if (prev) + prev->link.stqe_next = current->link.stqe_next; + else + se->data = current->link.stqe_next; + break; + } + if (st->need_lock) + rte_spinlock_unlock(&st->l); } -static inline struct stbl_entry * -stbl_find_entry(struct stbl *st, const union pkt_info *pi) +struct bhash4_key { + uint16_t port; + uint32_t addr; +} __attribute__((__packed__)); + +struct bhash6_key { + uint16_t port; + rte_xmm_t addr; +} __attribute__((__packed__)); + +struct bhash_key { + uint16_t port; + union { + uint32_t addr4; + rte_xmm_t addr6; + }; +} __attribute__((__packed__)); + +void bhash_fini(struct tle_ctx *ctx); + +int bhash_init(struct tle_ctx *ctx); + +static inline int +bhash_sockaddr2key(const struct sockaddr *addr, struct bhash_key *key) { - int32_t rc; - uint32_t type; - struct shtbl *ht; - struct stbl_key k; - - type = pi->tf.type; - stbl_pkt_fill_key(&k, pi, type); - ht = st->ht + type; - - rc = rte_hash_lookup(ht->t, &k); - if ((uint32_t)rc >= ht->nb_ent) - return NULL; - return ht->ent + rc; + int t; + const struct sockaddr_in *lin4; + const struct sockaddr_in6 *lin6; + + if (addr->sa_family == AF_INET) { + lin4 = (const struct sockaddr_in *)addr; + key->port = lin4->sin_port; + key->addr4 = lin4->sin_addr.s_addr; + t = TLE_V4; + } else { + lin6 = (const struct sockaddr_in6 *)addr; + memcpy(&key->addr6, &lin6->sin6_addr, sizeof(key->addr6)); + key->port = lin6->sin6_port; + t = TLE_V6; + } + + return t; } -static inline void * -stbl_find_data(struct stbl *st, const union pkt_info *pi) +/* Return 0 on success; + * Return errno on failure. + */ +static inline int +bhash_add_entry(struct tle_ctx *ctx, const struct sockaddr *addr, + struct tle_stream *s) { - struct stbl_entry *ent; - - ent = stbl_find_entry(st, pi); - return (ent == NULL) ? NULL : ent->data; + int t; + int rc; + int is_first; + struct bhash_key key; + struct rte_hash *bhash; + struct tle_stream *old, *tmp; + + is_first = 0; + t = bhash_sockaddr2key(addr, &key); + + rte_spinlock_lock(&ctx->bhash_lock[t]); + bhash = ctx->bhash[t]; + rc = rte_hash_lookup_data(bhash, &key, (void **)&old); + if (rc == -ENOENT) { + is_first = 1; + rc = rte_hash_add_key_data(bhash, &key, s); + } else if (rc >= 0) { + if (t == TLE_V4 && old->type == TLE_V6) { + /* V6 stream may listen V4 address, assure V4 stream + * is ahead of V6 stream in the list + */ + s->link.stqe_next = old; + rte_hash_add_key_data(bhash, &key, s); + } else { + tmp = old->link.stqe_next; + old->link.stqe_next = s; + s->link.stqe_next = tmp; + } + } + rte_spinlock_unlock(&ctx->bhash_lock[t]); + + /* IPv6 socket with unspecified address could receive IPv4 packets. + * So the stream should also be recorded in IPv4 table. 
+ * Only the first stream need be inserted into V4 list, otherwise + * the V6 list is already following V4 list. + */ + if (t == TLE_V6 && !s->option.ipv6only && is_first && + IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) { + t = TLE_V4; + rte_spinlock_lock(&ctx->bhash_lock[t]); + bhash = ctx->bhash[t]; + rc = rte_hash_lookup_data(bhash, &key, (void **)&old); + if (rc == -ENOENT) + rc = rte_hash_add_key_data(bhash, &key, s); + else if (rc >= 0) { + while(old->link.stqe_next != NULL) + old = old->link.stqe_next; + old->link.stqe_next = s; + } + rte_spinlock_unlock(&ctx->bhash_lock[t]); + } + + return (rc >= 0) ? 0 : (-rc); } -#include "tcp_stream.h" - static inline void -stbl_stream_fill_key(struct stbl_key *k, const struct tle_stream *s, - uint32_t type) +bhash_del_entry(struct tle_ctx *ctx, struct tle_stream *s, + const struct sockaddr *addr) { - static const struct stbl_key zero = { - .port.raw = 0, - }; + int t; + int rc; + struct bhash_key key; + struct tle_stream *f, *cur, *pre = NULL; + + t = bhash_sockaddr2key(addr, &key); + + rte_spinlock_lock(&ctx->bhash_lock[t]); + rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f); + if (rc >= 0) { + cur = f; + pre = NULL; + while (cur != s) { + pre = cur; + cur = cur->link.stqe_next; + } + + if (pre == NULL) { + cur = cur->link.stqe_next; + if (cur == NULL) + rte_hash_del_key(ctx->bhash[t], &key); + else /* change data */ + rte_hash_add_key_data(ctx->bhash[t], &key, cur); + } else + pre->link.stqe_next = cur->link.stqe_next; + } + + rte_spinlock_unlock(&ctx->bhash_lock[t]); + + if (rc < 0) + return; + + /* IPv6 socket with unspecified address could receive IPv4 packets. + * So the stream should also be recorded in IPv4 table*/ + if (t == TLE_V6 && !s->option.ipv6only && pre == NULL && + IN6_IS_ADDR_UNSPECIFIED(&key.addr6)) { + t = TLE_V4; + rte_spinlock_lock(&ctx->bhash_lock[t]); + rc = rte_hash_lookup_data(ctx->bhash[t], &key, (void **)&f); + if (rc >= 0) { + cur = f; + pre = NULL; + while (cur != s) { + pre = cur; + cur = cur->link.stqe_next; + } + + if (pre == NULL) { + cur = cur->link.stqe_next; + if (cur == NULL) + rte_hash_del_key(ctx->bhash[t], &key); + else /* change data */ + rte_hash_add_key_data(ctx->bhash[t], &key, cur); + } else + pre->link.stqe_next = cur->link.stqe_next; + } + + rte_spinlock_unlock(&ctx->bhash_lock[t]); + } - k->port = s->port; - if (type == TLE_V4) - k->addr4 = s->ipv4.addr; - else if (type == TLE_V6) - k->addr6 = s->ipv6.addr; - else - *k = zero; } -static inline struct stbl_entry * -stbl_add_stream_lock(struct stbl *st, const struct tle_tcp_stream *s) +static inline void * +bhash_reuseport_get_stream(struct tle_stream *s) { - uint32_t type; - struct stbl_key k; - struct stbl_entry *se; - struct shtbl *ht; - int32_t rc; - - type = s->s.type; - stbl_stream_fill_key(&k, &s->s, type); - ht = st->ht + type; + int n = 0; + struct tle_stream *e, *all[32]; + + e = s; + while(e && n < 32) { + all[n++] = e; + e = e->link.stqe_next; + } + + /* for each connection, this function will be called twice + * 1st time for the first handshake: SYN + * 2nd time for the third handshake: ACK + */ + return all[(s->reuseport_seed++) % n]; +} - stbl_lock(st, type); - rc = rte_hash_add_key(ht->t, &k); - stbl_unlock(st, type); +static inline void * +bhash_lookup4(struct rte_hash *t, uint32_t addr, uint16_t port, uint8_t reuse) +{ + int rc; + void *s = NULL; + struct bhash_key key = { + .port = port, + .addr4 = addr, + }; - if ((uint32_t)rc >= ht->nb_ent) - return NULL; + rc = rte_hash_lookup_data(t, &key, &s); + if (rc == -ENOENT) { + 
key.addr4 = INADDR_ANY; + rc = rte_hash_lookup_data(t, &key, &s); + } - se = ht->ent + rc; - if (se != NULL) - se->data = (void *)(uintptr_t)s; + if (rc >= 0) { + if (reuse) + return bhash_reuseport_get_stream(s); + else + return s; + } - return se; + return NULL; } -static inline void -stbl_del_stream(struct stbl *st, struct stbl_entry *se, - const struct tle_tcp_stream *s, uint32_t lock) +static inline void * +bhash_lookup6(struct rte_hash *t, rte_xmm_t addr, uint16_t port, uint8_t reuse) { - uint32_t type; - struct stbl_key k; + int rc; + void *s = NULL; + struct bhash_key key = { + .port = port, + .addr6 = addr, + }; - if (se == NULL) - return; + rc = rte_hash_lookup_data(t, &key, &s); + if (rc == -ENOENT) { + memcpy(&key.addr6, &tle_ipv6_any, sizeof(key.addr6)); + rc = rte_hash_lookup_data(t, &key, &s); + } - se->data = NULL; + if (rc >= 0) { + if (reuse) + return bhash_reuseport_get_stream(s); + else + return s; + } - type = s->s.type; - stbl_stream_fill_key(&k, &s->s, type); - if (lock != 0) - stbl_lock(st, type); - rte_hash_del_key(st->ht[type].t, &k); - if (lock != 0) - stbl_unlock(st, type); + return NULL; } #ifdef __cplusplus diff --git a/lib/libtle_l4p/syncookie.h b/lib/libtle_l4p/syncookie.h index 61bfce4..bf01e78 100644 --- a/lib/libtle_l4p/syncookie.h +++ b/lib/libtle_l4p/syncookie.h @@ -182,9 +182,12 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to) { uint32_t ack, mss, seq, wscale; + tcb->err = 0; + seq = si->seq; tcb->rcv.nxt = seq; + tcb->rcv.cpy = seq; tcb->rcv.irs = seq - 1; tcb->snd.wu.wl1 = seq; @@ -202,6 +205,7 @@ sync_fill_tcb(struct tcb *tcb, const union seg_info *si, const union tsopt *to) tcb->so.mss = mss; tcb->snd.ts = to->ecr; + tcb->snd.cork_ts = 0; tcb->rcv.ts = to->val; tcb->so.ts.raw = to->raw; diff --git a/lib/libtle_l4p/tcp_ctl.h b/lib/libtle_l4p/tcp_ctl.h index bec1e76..3955eaa 100644 --- a/lib/libtle_l4p/tcp_ctl.h +++ b/lib/libtle_l4p/tcp_ctl.h @@ -22,6 +22,7 @@ #include "tcp_stream.h" #include "tcp_ofo.h" +#include "tcp_timer.h" #ifdef __cplusplus extern "C" { @@ -97,10 +98,10 @@ calc_rx_wnd(const struct tle_tcp_stream *s, uint32_t scale) /* peer doesn't support WSCALE option, wnd size is limited to 64K */ if (scale == TCP_WSCALE_NONE) { - wnd = _rte_ring_get_mask(s->rx.q) << TCP_WSCALE_DEFAULT; + wnd = rte_ring_free_count(s->rx.q) << TCP_WSCALE_DEFAULT; return RTE_MIN(wnd, (uint32_t)UINT16_MAX); } else - return _rte_ring_get_mask(s->rx.q) << scale; + return rte_ring_free_count(s->rx.q) << scale; } /* empty stream's send queue */ @@ -148,8 +149,11 @@ tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s) st = CTX_TCP_STLB(ctx); + /* stop rto timer */ + timer_stop(s); /* reset TX armed */ rte_atomic32_set(&s->tx.arm, 0); + s->tx.need_da = 0; /* reset TCB */ uop = s->tcb.uop & ~TCP_OP_CLOSE; @@ -167,8 +171,7 @@ tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s) if (s->ste != NULL) { /* remove entry from RX streams table */ - stbl_del_stream(st, s->ste, s, - (s->flags & TLE_CTX_FLAG_ST) == 0); + stbl_del_stream(st, s->ste, &s->s); s->ste = NULL; empty_rq(s); } @@ -184,6 +187,37 @@ tcp_stream_reset(struct tle_ctx *ctx, struct tle_tcp_stream *s) put_stream(ctx, &s->s, TCP_STREAM_TX_FINISHED(s)); } +static inline void +stream_term(struct tle_tcp_stream *s) +{ + struct sdr *dr; + + s->tcb.state = TCP_ST_CLOSED; + rte_smp_wmb(); + + timer_stop(s); + + /* close() was already invoked, schedule final cleanup */ + if ((s->tcb.uop & TCP_OP_CLOSE) != 0) { + if (s->ste != NULL) { + /* remove entry from RX 
streams table */ + stbl_del_stream(CTX_TCP_STLB(s->s.ctx), s->ste, &s->s); + s->ste = NULL; + empty_rq(s); + } + + dr = CTX_TCP_SDR(s->s.ctx); + rte_spinlock_lock(&dr->lock); + STAILQ_INSERT_TAIL(&dr->be, &s->s, link); + rte_spinlock_unlock(&dr->lock); + + /* notify user that stream need to be closed */ + } else if (s->err.ev != NULL) + tle_event_raise(s->err.ev); + else if (s->err.cb.func != NULL) + s->err.cb.func(s->err.cb.data, &s->s); +} + #ifdef __cplusplus } #endif diff --git a/lib/libtle_l4p/tcp_misc.h b/lib/libtle_l4p/tcp_misc.h index 0ca5429..63efa8c 100644 --- a/lib/libtle_l4p/tcp_misc.h +++ b/lib/libtle_l4p/tcp_misc.h @@ -30,7 +30,7 @@ extern "C" { * of protocol related data. */ -#define TCP_WSCALE_DEFAULT 7 +#define TCP_WSCALE_DEFAULT 10 #define TCP_WSCALE_NONE 0 #define TCP_TX_HDR_MAX (sizeof(struct tcp_hdr) + TCP_TX_OPT_LEN_MAX) @@ -71,27 +71,6 @@ extern "C" { /* TCP flags mask. */ #define TCP_FLAG_MASK UINT8_MAX -union typflg { - uint16_t raw; - struct { - uint8_t type; /* TLE_V4/TLE_V6 */ - uint8_t flags; /* TCP header flags */ - }; -}; - -union pkt_info { - rte_xmm_t raw; - struct { - union typflg tf; - uint16_t csf; /* checksum flags */ - union l4_ports port; - union { - union ipv4_addrs addr4; - const union ipv6_addrs *addr6; - }; - }; -}; - union seg_info { rte_xmm_t raw; struct { @@ -226,7 +205,7 @@ struct dack_info { }; /* get current timestamp in ms */ -static inline uint32_t +static inline uint64_t tcp_get_tms(uint32_t mshift) { uint64_t ts; @@ -382,6 +361,8 @@ get_tms_opts(uintptr_t p, uint32_t len) else if (kind == TCP_OPT_KIND_NOP) i += sizeof(to->kl.kind); else { + if (to->kl.len == 0) + break; i += to->kl.len; if (i <= len && to->kl.raw == TCP_OPT_KL_TMS) { ts.val = rte_be_to_cpu_32(to->ts.val); @@ -441,7 +422,6 @@ get_pkt_info(const struct rte_mbuf *m, union pkt_info *pi, union seg_info *si) ((uintptr_t)tcph + offsetof(struct tcp_hdr, src_port)); pi->tf.flags = tcph->tcp_flags; pi->tf.type = type; - pi->csf = m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD); pi->port.raw = prt->raw; get_seg_info(tcph, si); diff --git a/lib/libtle_l4p/tcp_ofo.c b/lib/libtle_l4p/tcp_ofo.c index 1565445..b31f2b5 100644 --- a/lib/libtle_l4p/tcp_ofo.c +++ b/lib/libtle_l4p/tcp_ofo.c @@ -12,7 +12,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
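
Two related changes above affect how much receive window is advertised: calc_rx_wnd() now derives the window from the free slots actually left in the stream's rx ring (so it shrinks as the application falls behind) rather than from the constant ring mask, and TCP_WSCALE_DEFAULT grows from 7 to 10. A worked sketch of the arithmetic (the free-slot counts are invented):

    #include <stdint.h>
    #include <stdio.h>

    #define TCP_WSCALE_DEFAULT 10   /* raised from 7 by this patch */

    static uint32_t calc_rx_wnd(uint32_t free_slots, uint32_t wscale)
    {
        if (wscale == 0) {          /* peer without WSCALE: cap at 64K-1 */
            uint32_t wnd = free_slots << TCP_WSCALE_DEFAULT;
            return wnd > UINT16_MAX ? UINT16_MAX : wnd;
        }
        return free_slots << wscale;
    }

    int main(void)
    {
        /* 512 free mbuf slots, scale 10 -> advertise 524288 bytes */
        printf("%u\n", calc_rx_wnd(512, TCP_WSCALE_DEFAULT));
        /* same ring, peer did not negotiate scaling -> clamp to 65535 */
        printf("%u\n", calc_rx_wnd(512, 0));
        return 0;
    }
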
*/ -#include #include #include "tcp_stream.h" @@ -28,12 +27,6 @@ #define OFO_OBJ_MAX (OFODB_OBJ_MAX * OFO_DB_MAX) void -tcp_ofo_free(struct ofo *ofo) -{ - rte_free(ofo); -} - -static void calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb) { uint32_t n, nd, no; @@ -51,35 +44,3 @@ calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb) *nobj = no; *ndb = nd; } - -struct ofo * -tcp_ofo_alloc(uint32_t nbufs, int32_t socket) -{ - uint32_t i, ndb, nobj; - size_t dsz, osz, sz; - struct ofo *ofo; - struct rte_mbuf **obj; - - calc_ofo_elems(nbufs, &nobj, &ndb); - osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb; - dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb; - sz = osz + dsz; - - ofo = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, socket); - if (ofo == NULL) { - TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d " - "failed with error code: %d\n", - __func__, sz, socket, rte_errno); - return NULL; - } - - obj = (struct rte_mbuf **)&ofo->db[ndb]; - for (i = 0; i != ndb; i++) { - ofo->db[i].nb_max = nobj; - ofo->db[i].obj = obj + i * nobj; - } - - ofo->nb_max = ndb; - return ofo; -} - diff --git a/lib/libtle_l4p/tcp_ofo.h b/lib/libtle_l4p/tcp_ofo.h index 4580402..e72393f 100644 --- a/lib/libtle_l4p/tcp_ofo.h +++ b/lib/libtle_l4p/tcp_ofo.h @@ -33,6 +33,15 @@ struct ofo { struct ofodb db[]; }; +static inline void +_ofodb_copy(struct ofodb *dst, struct ofodb *src) +{ + dst->nb_elem = src->nb_elem; + dst->sl = src->sl; + rte_memcpy(dst->obj, src->obj, + src->nb_elem * sizeof(struct rte_mbuf*)); +} + static inline void _ofodb_free(struct ofodb *db) { @@ -49,7 +58,7 @@ _ofo_remove(struct ofo *ofo, uint32_t pos, uint32_t num) n = ofo->nb_elem - num - pos; for (i = 0; i != n; i++) - ofo->db[pos + i] = ofo->db[pos + num + i]; + _ofodb_copy(&ofo->db[pos + i], &ofo->db[pos + num + i]); ofo->nb_elem -= num; } @@ -78,14 +87,14 @@ _ofo_insert_new(struct ofo *ofo, uint32_t pos, union seqlen *sl, return 0; /* allocate new one */ - db = ofo->db + n; ofo->nb_elem = n + 1; /* insert into a proper position. */ for (i = n; i != pos; i--) - ofo->db[i] = ofo->db[i - 1]; + _ofodb_copy(&ofo->db[i], &ofo->db[i - 1]); /* fill new block */ + db = ofo->db + pos; n = RTE_MIN(db->nb_max, num); for (i = 0; i != n; i++) db->obj[i] = mb[i]; @@ -119,11 +128,10 @@ _ofo_insert_right(struct ofo *ofo, uint32_t pos, union seqlen *sl, /* skip overlapping packets */ for (i = 0, n = skip; i != num && n != 0; i++, n -= plen) { - plen = mb[i]->pkt_len; if (n < plen) { /* adjust partially overlapped packet. 
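
The switch from rte_pktmbuf_adj() to _rte_pktmbuf_adj() here matters because the replacement (added to misc.h earlier in this patch) may free whole leading segments and hand back a different head mbuf, so its return value must be written back, as in "mb[i] = _rte_pktmbuf_adj(mb[i], n)". A minimal single-linked model of that behavior, without DPDK types:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct seg { uint32_t len; struct seg *next; };

    static struct seg *adj(struct seg *m, uint32_t cut)
    {
        while (m != NULL && cut >= m->len) {  /* drop consumed segments */
            struct seg *next = m->next;
            cut -= m->len;
            free(m);
            m = next;
        }
        if (m != NULL)
            m->len -= cut;        /* trim inside the new head */
        return m;                 /* caller must keep using this pointer */
    }

    int main(void)
    {
        struct seg *b = malloc(sizeof(*b)); b->len = 100; b->next = NULL;
        struct seg *a = malloc(sizeof(*a)); a->len = 60;  a->next = b;

        a = adj(a, 80);           /* frees the 60-byte head, trims 20 more */
        printf("head now %u bytes\n", a->len);   /* 80 */
        free(a);
        return 0;
    }
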
*/ - rte_pktmbuf_adj(mb[i], plen - n); + mb[i] = _rte_pktmbuf_adj(mb[i], n); break; } } @@ -155,12 +163,12 @@ static inline uint32_t _ofo_step(struct ofo *ofo, union seqlen *sl, struct rte_mbuf *mb[], uint32_t num) { - uint32_t i, n, end, lo, ro; - struct ofodb *db; + uint32_t i, j, n, ro, mn; + struct ofodb *db, *nextdb; db = NULL; - end = sl->seq + sl->len; n = ofo->nb_elem; + mn = num; /* * start from the right side, assume that after some gap, @@ -172,25 +180,57 @@ _ofo_step(struct ofo *ofo, union seqlen *sl, struct rte_mbuf *mb[], break; } - /* new db required */ - if ((int32_t)i < 0 || tcp_seq_lt(db->sl.seq + db->sl.len, sl->seq)) - return _ofo_insert_new(ofo, i + 1, sl, mb, num); - - /* new one is right adjacent, or overlap */ - - ro = sl->seq - db->sl.seq; - lo = end - db->sl.seq; + if (i != n - 1) { + nextdb = ofo->db + (i + 1); + /* overlap with right side, remove overlapped part from mb */ + if (tcp_seq_lt(nextdb->sl.seq, sl->seq + sl->len)) { + ro = sl->seq + sl->len - nextdb->sl.seq; + j = num - 1; + while (ro > 0) { + if (ro >= mb[j]->pkt_len) { + ro -= mb[j]->pkt_len; + sl->len -= mb[j]->pkt_len; + rte_pktmbuf_free(mb[j]); + num--; + j--; + } else { + _rte_pktmbuf_trim(mb[j], ro); + break; + } + } + } + } /* new one is completely overlapped by old one */ - if (lo <= db->sl.len) - return 0; + if ((int32_t)i >= 0 && tcp_seq_leq(sl->seq + sl->len, db->sl.seq + db->sl.len)) { + for (j = 0; j < num; j++) { + rte_pktmbuf_free(mb[j]); + } + return mn; + } - /* either overlap OR (adjacent AND some free space remains) */ - if (ro < db->sl.len || db->nb_elem != db->nb_max) - return _ofo_insert_right(ofo, i, sl, mb, num); + /* new one is partially overlapped by old one + * OR new one is right adjacent and current db still have space*/ + j = 0; + if ((int32_t)i >= 0 && + (tcp_seq_lt(sl->seq, db->sl.seq + db->sl.len) || + (sl->seq == db->sl.seq + db->sl.len && + db->nb_elem < db->nb_max))) { + j = _ofo_insert_right(ofo, i, sl, mb, num); + } - /* adjacent, no free space in current block */ - return _ofo_insert_new(ofo, i + 1, sl, mb, num); + while (j != num) { + i++; + n = _ofo_insert_new(ofo, i, sl, mb + j, num - j); + if (n == 0) + break; + j += n; + } + + for (; j < num; j++) { + rte_pktmbuf_free(mb[j]); + } + return mn; } static inline void @@ -199,7 +239,7 @@ _ofo_compact(struct ofo *ofo) uint32_t i, j, n, ro; struct ofodb *db; - for (i = 0; i < ofo->nb_elem; i = j) { + for (i = 0; i < ofo->nb_elem; i++) { for (j = i + 1; j != ofo->nb_elem; j++) { @@ -213,6 +253,8 @@ _ofo_compact(struct ofo *ofo) db->nb_elem); if (n < db->nb_elem) { db->nb_elem -= n; + memmove(db->obj, db->obj + n, + db->nb_elem * sizeof(struct rte_mbuf*)); break; } } @@ -224,23 +266,41 @@ _ofo_compact(struct ofo *ofo) } static inline uint32_t -_ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, union seqlen *sl) +_ofodb_enqueue(struct rte_ring *r, const struct ofodb *db, uint32_t *seq) { - uint32_t n, num; + uint32_t i, n, num, begin, end; + struct rte_mbuf* pkt; + n = 0; num = db->nb_elem; - sl->raw = db->sl.raw; - n = _rte_ring_enqueue_burst(r, (void * const *)db->obj, num); + begin = db->sl.seq; + i = 0; + pkt = db->obj[0]; + + /* removed overlapped part from db */ + while (tcp_seq_lt(begin, *seq)) { + end = begin + pkt->pkt_len; + if (tcp_seq_leq(end, *seq)) { + /* pkt is completely overlapped */ + begin = end; + rte_pktmbuf_free(pkt); + pkt = db->obj[++i]; + } else { + /* pkt is partly overlapped */ + db->obj[i] = _rte_pktmbuf_adj(pkt, *seq - begin); + break; + } + } + + n = i; + n += 
_rte_ring_enqueue_burst(r, (void * const *)(db->obj + i), num - i); - sl->len -= tcp_mbuf_seq_free(db->obj + n, num - n); + *seq = db->sl.seq + db->sl.len; + *seq -= tcp_mbuf_seq_free(db->obj + n, num - n); return num - n; } -struct ofo * -tcp_ofo_alloc(uint32_t nbufs, int32_t socket); - -void -tcp_ofo_free(struct ofo *ofo); +void calc_ofo_elems(uint32_t nbufs, uint32_t *nobj, uint32_t *ndb); #ifdef __cplusplus } diff --git a/lib/libtle_l4p/tcp_rxq.h b/lib/libtle_l4p/tcp_rxq.h index 01f34fa..59ba043 100644 --- a/lib/libtle_l4p/tcp_rxq.h +++ b/lib/libtle_l4p/tcp_rxq.h @@ -17,6 +17,7 @@ #define _TCP_RXQ_H_ #include "tcp_ofo.h" +#include "tcp_ctl.h" #ifdef __cplusplus extern "C" { @@ -31,14 +32,9 @@ static inline uint32_t rx_ofo_enqueue(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mb[], uint32_t num) { - uint32_t i, n; - - n = 0; - do { - i = _ofo_step(s->rx.ofo, sl, mb + n, num - n); - n += i; - } while (i != 0 && n != num); + uint32_t n; + n = _ofo_step(s->rx.ofo, sl, mb, num); _ofo_compact(s->rx.ofo); return n; } @@ -46,14 +42,16 @@ rx_ofo_enqueue(struct tle_tcp_stream *s, union seqlen *sl, static inline uint32_t rx_ofo_reduce(struct tle_tcp_stream *s) { - uint32_t i, n, end, seq; + uint32_t i, n, seq; struct ofo *ofo; struct ofodb *db; - union seqlen sl; seq = s->tcb.rcv.nxt; ofo = s->rx.ofo; + if (ofo->nb_elem == 0) + return 0; + n = 0; for (i = 0; i != ofo->nb_elem; i++) { @@ -63,19 +61,16 @@ rx_ofo_reduce(struct tle_tcp_stream *s) if (tcp_seq_lt(seq, db->sl.seq)) break; - end = db->sl.seq + db->sl.len; - /* this db is fully overlapped */ - if (tcp_seq_leq(end, seq)) + if (tcp_seq_leq(db->sl.seq + db->sl.len, seq)) _ofodb_free(db); else - n += _ofodb_enqueue(s->rx.q, db, &sl); - - seq = sl.seq + sl.len; + n += _ofodb_enqueue(s->rx.q, db, &seq); } s->tcb.rcv.nxt = seq; _ofo_remove(ofo, 0, i); + return n; } @@ -135,6 +130,8 @@ rx_data_enqueue(struct tle_tcp_stream *s, uint32_t seq, uint32_t len, } n = rte_ring_count(s->rx.q); + /* update receive window with left recv buffer*/ + s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale); if (r != n) { /* raise RX event */ if (s->rx.ev != NULL) diff --git a/lib/libtle_l4p/tcp_rxtx.c b/lib/libtle_l4p/tcp_rxtx.c index c0a0dd1..56eb606 100644 --- a/lib/libtle_l4p/tcp_rxtx.c +++ b/lib/libtle_l4p/tcp_rxtx.c @@ -28,9 +28,14 @@ #include "tcp_rxq.h" #include "tcp_txq.h" #include "tcp_tx_seg.h" +#include "tcp_rxtx.h" #define TCP_MAX_PKT_SEG 0x20 +/* must larger than l2_len(14)+l3_len(20)+l4_len(20)+tms_option(12) */ +#define RESERVE_HEADER_LEN 128 +#define WIN_NOTIFY_THRESH 64 + /* * checks if input TCP ports and IP addresses match given stream. * returns zero on success. 
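
The reworked _ofodb_enqueue() above dedups against a running sequence number instead of a seqlen pair: packets of an out-of-order block lying entirely below *seq are freed, a partially overlapped head is trimmed, and *seq then advances to the end of the block. A sketch with plain integers standing in for mbufs (the wrap-safe comparisons mimic tcp_seq_lt/tcp_seq_leq):

    #include <stdint.h>
    #include <stdio.h>

    struct pkt { uint32_t seq, len; };

    static void enqueue_block(struct pkt *db, unsigned n, uint32_t *seq)
    {
        for (unsigned i = 0; i < n; i++) {
            uint32_t end = db[i].seq + db[i].len;

            if ((int32_t)(end - *seq) <= 0)
                continue;                       /* fully overlapped: drop */
            if ((int32_t)(db[i].seq - *seq) < 0) {
                db[i].len -= *seq - db[i].seq;  /* trim overlapped head */
                db[i].seq = *seq;
            }
            printf("deliver [%u,%u)\n", db[i].seq, end);
            *seq = end;                         /* rcv.nxt moves forward */
        }
    }

    int main(void)
    {
        struct pkt db[] = { {100, 50}, {150, 50}, {200, 50} };
        uint32_t rcv_nxt = 160;    /* in-order data already covers 100..159 */

        enqueue_block(db, 3, &rcv_nxt);
        printf("rcv.nxt = %u\n", rcv_nxt);      /* 250 */
        return 0;
    }
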
@@ -54,11 +59,17 @@ rx_check_stream(const struct tle_tcp_stream *s, const union pkt_info *pi) static inline struct tle_tcp_stream * rx_obtain_listen_stream(const struct tle_dev *dev, const union pkt_info *pi, - uint32_t type) + uint32_t type, uint8_t reuse) { struct tle_tcp_stream *s; - s = (struct tle_tcp_stream *)dev->dp[type]->streams[pi->port.dst]; + if (type == TLE_V4) + s = bhash_lookup4(dev->ctx->bhash[type], + pi->addr4.dst, pi->port.dst, reuse); + else + s = bhash_lookup6(dev->ctx->bhash[type], + pi->addr6->dst, pi->port.dst, reuse); + if (s == NULL || tcp_stream_acquire(s) < 0) return NULL; @@ -77,10 +88,10 @@ rx_obtain_stream(const struct tle_dev *dev, struct stbl *st, { struct tle_tcp_stream *s; - s = stbl_find_data(st, pi); + s = TCP_STREAM(stbl_find_stream(st, pi)); if (s == NULL) { - if (pi->tf.flags == TCP_FLAG_ACK) - return rx_obtain_listen_stream(dev, pi, type); + if (pi->tf.flags & TCP_FLAG_ACK) + return rx_obtain_listen_stream(dev, pi, type, 1); return NULL; } @@ -150,131 +161,6 @@ pkt_info_bulk_syneq(const union pkt_info pi[], uint32_t num) return i; } -static inline void -stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[], - uint32_t nb_drb) -{ - _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb); -} - -static inline uint32_t -stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[], - uint32_t nb_drb) -{ - return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb); -} - -static inline uint32_t -get_ip_pid(struct tle_dev *dev, uint32_t num, uint32_t type, uint32_t st) -{ - uint32_t pid; - rte_atomic32_t *pa; - - pa = &dev->tx.packet_id[type]; - - if (st == 0) { - pid = rte_atomic32_add_return(pa, num); - return pid - num; - } else { - pid = rte_atomic32_read(pa); - rte_atomic32_set(pa, pid + num); - return pid; - } -} - -static inline void -fill_tcph(struct tcp_hdr *l4h, const struct tcb *tcb, union l4_ports port, - uint32_t seq, uint8_t hlen, uint8_t flags) -{ - uint16_t wnd; - - l4h->src_port = port.dst; - l4h->dst_port = port.src; - - wnd = (flags & TCP_FLAG_SYN) ? - RTE_MIN(tcb->rcv.wnd, (uint32_t)UINT16_MAX) : - tcb->rcv.wnd >> tcb->rcv.wscale; - - /* ??? use sse shuffle to hton all remaining 16 bytes at once. ??? */ - l4h->sent_seq = rte_cpu_to_be_32(seq); - l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt); - l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET; - l4h->tcp_flags = flags; - l4h->rx_win = rte_cpu_to_be_16(wnd); - l4h->cksum = 0; - l4h->tcp_urp = 0; - - if (flags & TCP_FLAG_SYN) - fill_syn_opts(l4h + 1, &tcb->so); - else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0) - fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts); -} - -static inline int -tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s, - const struct tle_dest *dst, uint64_t ol_flags, - union l4_ports port, uint32_t seq, uint32_t flags, - uint32_t pid, uint32_t swcsm) -{ - uint32_t l4, len, plen; - struct tcp_hdr *l4h; - char *l2h; - - len = dst->l2_len + dst->l3_len; - plen = m->pkt_len; - - if (flags & TCP_FLAG_SYN) - l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_MAX; - else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0) - l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS; - else - l4 = sizeof(*l4h); - - /* adjust mbuf to put L2/L3/L4 headers into it. 
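
rx_obtain_listen_stream() now resolves listeners through the bind hash with reuseport selection enabled: all streams bound to the same (address, port) sit on one list, and a per-head counter spreads incoming connections across them round-robin (bhash_reuseport_get_stream() earlier in this patch). A stand-in sketch of just the selection step (the worker names are invented):

    #include <stdio.h>

    static const char *listeners[] = { "worker-0", "worker-1", "worker-2" };
    static unsigned reuseport_seed;

    static const char *pick_listener(void)
    {
        unsigned n = sizeof(listeners) / sizeof(listeners[0]);
        return listeners[reuseport_seed++ % n];
    }

    int main(void)
    {
        for (int i = 0; i < 4; i++)
            puts(pick_listener());   /* worker-0, 1, 2, then 0 again */
        return 0;
    }
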
*/ - l2h = rte_pktmbuf_prepend(m, len + l4); - if (l2h == NULL) - return -EINVAL; - - /* copy L2/L3 header */ - rte_memcpy(l2h, dst->hdr, len); - - /* setup TCP header & options */ - l4h = (struct tcp_hdr *)(l2h + len); - fill_tcph(l4h, &s->tcb, port, seq, l4, flags); - - /* setup mbuf TX offload related fields. */ - m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0); - m->ol_flags |= ol_flags; - - /* update proto specific fields. */ - - if (s->s.type == TLE_V4) { - struct ipv4_hdr *l3h; - l3h = (struct ipv4_hdr *)(l2h + dst->l2_len); - l3h->packet_id = rte_cpu_to_be_16(pid); - l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4); - - if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) - l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len, - ol_flags); - else if (swcsm != 0) - l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h); - - if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0) - l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len); - } else { - struct ipv6_hdr *l3h; - l3h = (struct ipv6_hdr *)(l2h + dst->l2_len); - l3h->payload_len = rte_cpu_to_be_16(plen + l4); - if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) - l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags); - else if (swcsm != 0) - l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h); - } - - return 0; -} - /* * That function supposed to be used only for data packets. * Assumes that L2/L3/L4 headers and mbuf fields already setup properly. @@ -355,6 +241,9 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num) i = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)m, num, drb, &nb); + if (i > 0 && s->tx.need_da) + s->tx.need_da = 0; + /* free unused drbs. */ if (nb != 0) stream_drb_free(s, drb + nbm - nb, nb); @@ -362,6 +251,115 @@ tx_data_pkts(struct tle_tcp_stream *s, struct rte_mbuf *const m[], uint32_t num) return i; } +/* + * case 0: pkt is not split yet, (indicate plen > sl->len) + * case 1: pkt is split, but left packet > sl->len + * case 2: pkt is split, but left packet <= sl->len + */ +static inline struct rte_mbuf * +get_indirect_mbuf(struct tle_tcp_stream *s, + struct rte_mbuf *m, uint32_t *p_plen, + union seqlen *sl, uint32_t type, + uint32_t mss) +{ + uint32_t hdr_len = PKT_L234_HLEN(m), plen, mlen; + struct rte_mbuf *f, *t; + uint16_t i, nb_segs; + void *hdr; + + if (m->next_pkt) { + f = m->next_pkt; + plen = f->data_len - f->next_offset; + if (f == m) + plen -= hdr_len; + } else { + f = m; + plen = f->data_len - hdr_len; + } + mlen = 0; + + TCP_LOG(DEBUG, "m(%p):pkt_len=%u,nb_segs=%u, sl->len = %u\n", + m, m->pkt_len, m->nb_segs, sl->len); + + /* Seg split needed: + * sometimes, cwnd will be reset to mss which is about 1400~1500 bytes + * which could be smaller than one seg. + * + * Our solution is to send part of one seg, and record the sended data + * offset in the seg. 
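
The partial-send bookkeeping described here works by remembering, per queued segment, how many payload bytes have already gone out: next_pkt points at the segment to continue from and next_offset at the position inside it. A simplified model of that logic, with a plain byte counter instead of indirect mbufs:

    #include <stdio.h>

    struct seg { unsigned len, next_offset; };

    static unsigned send_part(struct seg *s, unsigned wnd)
    {
        unsigned left = s->len - s->next_offset;
        unsigned n = (wnd < left) ? wnd : left;

        printf("tx bytes [%u,%u)\n", s->next_offset, s->next_offset + n);
        s->next_offset += n;
        if (s->next_offset == s->len)
            s->next_offset = 0;      /* segment fully sent: reset marker */
        return n;
    }

    int main(void)
    {
        struct seg s = { 1448, 0 };

        send_part(&s, 500);          /* cwnd collapsed to 500 bytes */
        send_part(&s, 500);
        send_part(&s, 1000);         /* the remaining 448 bytes go out */
        return 0;
    }
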
+ */ + if (sl->len < plen) { + uint32_t payload_len = RTE_MIN(sl->len, s->tx.dst.mtu + m->l2_len - hdr_len); + mlen = plen - payload_len; + nb_segs = 1; + plen = payload_len; + m->next_pkt = f; + } else { + t = f->next; + nb_segs = 1; + while (t && plen + t->data_len < sl->len) { + plen += t->data_len; + t = t->next; + nb_segs++; + } + + m->next_pkt = t; + } + + struct rte_mbuf *pkts[1 + nb_segs]; + if (rte_pktmbuf_alloc_bulk(s->tx.dst.head_mp, pkts, 1 + nb_segs) < 0) { + return NULL; + } + + rte_pktmbuf_attach(pkts[1], f); + if (f->next_offset) + rte_pktmbuf_adj(pkts[1], f->next_offset); + if (f == m) + rte_pktmbuf_adj(pkts[1], hdr_len); + + if (mlen > 0) { + rte_pktmbuf_trim(pkts[1], mlen); + f->next_offset += plen; + } else { + f->next_offset = 0; + } + + for (i = 1, t = f->next; i < nb_segs; ++i) { + rte_pktmbuf_attach(pkts[i+1], t); + pkts[i]->next = pkts[i+1]; + t = t->next; + } + + /* prepare l2/l3/l4 header */ + hdr = rte_pktmbuf_append(pkts[0], hdr_len); + rte_memcpy(hdr, rte_pktmbuf_mtod(m, void *), hdr_len); + pkts[0]->nb_segs = nb_segs + 1; + pkts[0]->pkt_len = plen + hdr_len; + pkts[0]->ol_flags = m->ol_flags; + pkts[0]->tx_offload = m->tx_offload; + if (type == TLE_V4) { + struct ipv4_hdr *l3h; + + l3h = rte_pktmbuf_mtod_offset(pkts[0], + struct ipv4_hdr *, m->l2_len); + l3h->total_length = + rte_cpu_to_be_16(plen + m->l3_len + m->l4_len); + } else { + struct ipv6_hdr *l3h; + + l3h = rte_pktmbuf_mtod_offset(pkts[0], + struct ipv6_hdr *, m->l2_len); + l3h->payload_len = + rte_cpu_to_be_16(plen + m->l4_len); + } + if (plen <= mss) + pkts[0]->ol_flags &= ~PKT_TX_TCP_SEG; + pkts[0]->next = pkts[1]; + + *p_plen = plen; + return pkts[0]; +} + static inline uint32_t tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[], uint32_t num) @@ -383,11 +381,10 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[], for (i = 0; i != num && sl->len != 0 && fail == 0; i++) { mb = mi[i]; - sz = RTE_MIN(sl->len, mss); plen = PKT_L4_PLEN(mb); /*fast path, no need to use indirect mbufs. */ - if (plen <= sz) { + if (mb->next_pkt == NULL && plen <= sl->len) { /* update pkt TCP header */ tcp_update_mbuf(mb, type, &s->tcb, sl->seq, pid + i); @@ -397,12 +394,45 @@ tx_data_bulk(struct tle_tcp_stream *s, union seqlen *sl, struct rte_mbuf *mi[], sl->len -= plen; sl->seq += plen; mo[k++] = mb; + if (sl->seq <= s->tcb.snd.rcvr) + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); /* remaining snd.wnd is less them MSS, send nothing */ - } else if (sz < mss) + } else if (sl->len < mss) { + break; + /* some data to send already */ + } else if (k != 0 || tn != 0) { break; /* packet indirection needed */ - else - RTE_VERIFY(0); + } else { + struct rte_mbuf *out; + + out = get_indirect_mbuf(s, mb, &plen, sl, type, mss); + if (out == NULL) + return 0; + + /* update pkt TCP header */ + tcp_update_mbuf(out, type, &s->tcb, sl->seq, pid + i); + + /* no need to bump refcnt !!! 
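
No explicit refcnt manipulation is needed at this point because rte_pktmbuf_attach(), used while building the indirect chain above, already increments the reference count of each underlying direct mbuf; freeing the transmitted clone therefore cannot release payload that is still sitting on the retransmit queue. A toy model of that ownership rule:

    #include <stdio.h>

    struct buf { int refcnt; };

    static void attach(struct buf *direct)   { direct->refcnt++; }

    static void free_buf(struct buf *direct)
    {
        if (--direct->refcnt == 0)
            puts("payload released");
        else
            printf("payload kept, refcnt=%d\n", direct->refcnt);
    }

    int main(void)
    {
        struct buf payload = { .refcnt = 1 }; /* held by retransmit queue */

        attach(&payload);                     /* indirect clone for TX */
        free_buf(&payload);                   /* NIC done: kept, refcnt=1 */
        free_buf(&payload);                   /* data ACKed: released */
        return 0;
    }
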
*/ + + if (tx_data_pkts(s, &out, 1) == 0) { + rte_pktmbuf_free(out); + return 0; + } + + sl->len -= plen; + sl->seq += plen; + + if (sl->seq <= s->tcb.snd.rcvr) + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + + if (mb->next_pkt) + return 0; + else { + tn = 1; + continue; + } + } if (k >= MAX_PKT_BURST) { n = tx_data_pkts(s, mo, k); @@ -466,7 +496,10 @@ tx_nxt_data(struct tle_tcp_stream *s, uint32_t tms) tcp_txq_set_nxt_head(s, n); } while (n == num); - s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt; + if (sl.seq != (uint32_t)s->tcb.snd.nxt) { + s->tcb.snd.nxt += sl.seq - (uint32_t)s->tcb.snd.nxt; + s->tcb.snd.ack = s->tcb.rcv.nxt; + } return tn; } @@ -503,6 +536,7 @@ free_una_data(struct tle_tcp_stream *s, uint32_t len) } while (plen < len); s->tcb.snd.una += len; + s->tcb.snd.waitlen -= len; /* * that could happen in case of retransmit, @@ -519,7 +553,7 @@ calc_smss(uint16_t mss, const struct tle_dest *dst) { uint16_t n; - n = dst->mtu - dst->l2_len - dst->l3_len - TCP_TX_HDR_DACK; + n = dst->mtu - dst->l3_len - sizeof(struct tcp_hdr); mss = RTE_MIN(n, mss); return mss; } @@ -537,67 +571,24 @@ initial_cwnd(uint32_t smss, uint32_t icw) return RTE_MIN(10 * smss, RTE_MAX(2 * smss, icw)); } -/* - * queue standalone packet to he particular output device - * It assumes that: - * - L2/L3/L4 headers should be already set. - * - packet fits into one segment. - */ -static inline int -send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m) -{ - uint32_t n, nb; - struct tle_drb *drb; - - if (stream_drb_alloc(s, &drb, 1) == 0) - return -ENOBUFS; - - /* enqueue pkt for TX. */ - nb = 1; - n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1, - &drb, &nb); - - /* free unused drbs. */ - if (nb != 0) - stream_drb_free(s, &drb, 1); - - return (n == 1) ? 0 : -ENOBUFS; -} - -static inline int -send_ctrl_pkt(struct tle_tcp_stream *s, struct rte_mbuf *m, uint32_t seq, - uint32_t flags) +void +tle_tcp_stream_kill(struct tle_stream *ts) { - const struct tle_dest *dst; - uint32_t pid, type; - int32_t rc; - - dst = &s->tx.dst; - type = s->s.type; - pid = get_ip_pid(dst->dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0); - - rc = tcp_fill_mbuf(m, s, dst, 0, s->s.port, seq, flags, pid, 1); - if (rc == 0) - rc = send_pkt(s, dst->dev, m); - - return rc; -} + struct tle_tcp_stream *s; -static inline int -send_rst(struct tle_tcp_stream *s, uint32_t seq) -{ - struct rte_mbuf *m; - int32_t rc; + s = TCP_STREAM(ts); + if (ts == NULL || s->s.type >= TLE_VNUM) + return; - m = rte_pktmbuf_alloc(s->tx.dst.head_mp); - if (m == NULL) - return -ENOMEM; + if (s->tcb.state > TCP_ST_LISTEN) + send_rst(s, s->tcb.snd.nxt); - rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_RST); - if (rc != 0) - rte_pktmbuf_free(m); + if (s->tcb.state == TCP_ST_ESTABLISHED) + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); - return rc; + s->tcb.state = TCP_ST_CLOSED; + rte_smp_wmb(); + timer_stop(s); } static inline int @@ -620,6 +611,7 @@ send_ack(struct tle_tcp_stream *s, uint32_t tms, uint32_t flags) return rc; } + s->tx.need_da = 0; s->tcb.snd.ack = s->tcb.rcv.nxt; return 0; } @@ -633,19 +625,23 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi, int32_t rc; uint32_t pid, seq, type; struct tle_dev *dev; - const void *da; + const void *sa, *da; struct tle_dest dst; const struct tcp_hdr *th; - type = s->s.type; + type = pi->tf.type; /* get destination information. 
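
calc_smss() above also changes its arithmetic: dst->mtu is treated as an L3 MTU, so only the IP header length and the fixed TCP header are subtracted. For a 1500-byte MTU that yields an SMSS of 1460 on IPv4 and 1440 on IPv6; when timestamps are negotiated, accept_prep_stream() below shaves another 12 bytes (TCP_TX_OPT_LEN_TMS) off. A worked example, with the standard header sizes assumed as constants:

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t calc_smss(uint16_t mss, uint16_t mtu, uint16_t l3_len)
    {
        uint16_t n = mtu - l3_len - 20;   /* 20 == sizeof(struct tcp_hdr) */
        return (n < mss) ? n : mss;
    }

    int main(void)
    {
        printf("v4: %u\n", calc_smss(UINT16_MAX, 1500, 20));    /* 1460 */
        printf("v6: %u\n", calc_smss(UINT16_MAX, 1500, 40));    /* 1440 */
        printf("v4+ts: %u\n", calc_smss(UINT16_MAX, 1500, 20) - 12);
        return 0;
    }
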
*/ - if (type == TLE_V4) + if (type == TLE_V4) { da = &pi->addr4.src; - else + sa = &pi->addr4.dst; + } + else { da = &pi->addr6->src; + sa = &pi->addr6->dst; + } - rc = stream_get_dest(&s->s, da, &dst); + rc = stream_get_dest(type, &s->s, sa, da, &dst); if (rc < 0) return rc; @@ -654,11 +650,16 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi, get_syn_opts(&s->tcb.so, (uintptr_t)(th + 1), m->l4_len - sizeof(*th)); s->tcb.rcv.nxt = si->seq + 1; + s->tcb.rcv.cpy = si->seq + 1; seq = sync_gen_seq(pi, s->tcb.rcv.nxt, ts, s->tcb.so.mss, s->s.ctx->prm.hash_alg, &s->s.ctx->prm.secret_key); - s->tcb.so.ts.ecr = s->tcb.so.ts.val; - s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale); + + if (s->tcb.so.ts.raw) { + s->tcb.so.ts.ecr = s->tcb.so.ts.val; + s->tcb.so.ts.val = sync_gen_ts(ts, s->tcb.so.wscale); + } + s->tcb.so.wscale = (s->tcb.so.wscale == TCP_WSCALE_NONE) ? TCP_WSCALE_NONE : TCP_WSCALE_DEFAULT; s->tcb.so.mss = calc_smss(dst.mtu, &dst); @@ -672,11 +673,13 @@ sync_ack(struct tle_tcp_stream *s, const union pkt_info *pi, dev = dst.dev; pid = get_ip_pid(dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0); - rc = tcp_fill_mbuf(m, s, &dst, 0, pi->port, seq, - TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1); + rc = tcp_fill_mbuf(m, s, &dst, TCP_OLFLAGS_CKSUM(dst.ol_flags), + pi->port, seq, TCP_FLAG_SYN | TCP_FLAG_ACK, pid, 1); if (rc == 0) rc = send_pkt(s, dev, m); + TCP_INC_STATS(TCP_MIB_PASSIVEOPENS); + return rc; } @@ -800,43 +803,24 @@ restore_syn_opt(union seg_info *si, union tsopt *to, return 0; } -static inline void -stream_term(struct tle_tcp_stream *s) -{ - struct sdr *dr; - - s->tcb.state = TCP_ST_CLOSED; - rte_smp_wmb(); - - timer_stop(s); - - /* close() was already invoked, schedule final cleanup */ - if ((s->tcb.uop & TCP_OP_CLOSE) != 0) { - - dr = CTX_TCP_SDR(s->s.ctx); - STAILQ_INSERT_TAIL(&dr->be, &s->s, link); - - /* notify user that stream need to be closed */ - } else if (s->err.ev != NULL) - tle_event_raise(s->err.ev); - else if (s->err.cb.func != NULL) - s->err.cb.func(s->err.cb.data, &s->s); -} - static inline int stream_fill_dest(struct tle_tcp_stream *s) { int32_t rc; uint32_t type; - const void *da; + const void *sa, *da; - type = s->s.type; - if (type == TLE_V4) + type = s->s.type; + if (type == TLE_V4) { + sa = &s->s.ipv4.addr.dst; da = &s->s.ipv4.addr.src; - else + } + else { + sa = &s->s.ipv6.addr.dst; da = &s->s.ipv6.addr.src; + } - rc = stream_get_dest(&s->s, da, &s->tx.dst); + rc = stream_get_dest(type, &s->s, sa, da, &s->tx.dst); return (rc < 0) ? rc : 0; } @@ -851,19 +835,17 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st, int32_t rc; uint32_t rtt; - /* some TX still pending for that stream. */ - if (TCP_STREAM_TX_PENDING(cs)) - return -EAGAIN; - /* setup L4 ports and L3 addresses fields. */ cs->s.port.raw = pi->port.raw; cs->s.pmsk.raw = UINT32_MAX; if (pi->tf.type == TLE_V4) { + cs->s.type = TLE_V4; cs->s.ipv4.addr = pi->addr4; cs->s.ipv4.mask.src = INADDR_NONE; cs->s.ipv4.mask.dst = INADDR_NONE; } else if (pi->tf.type == TLE_V6) { + cs->s.type = TLE_V6; cs->s.ipv6.addr = *pi->addr6; rte_memcpy(&cs->s.ipv6.mask.src, &tle_ipv6_none, sizeof(cs->s.ipv6.mask.src)); @@ -887,7 +869,7 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st, cs->tcb.snd.rto = TCP_RTO_DEFAULT; /* copy streams type & flags. */ - cs->s.type = ps->s.type; + cs->s.type = pi->tf.type; cs->flags = ps->flags; /* retrive and cache destination information. 
*/ @@ -897,16 +879,21 @@ accept_prep_stream(struct tle_tcp_stream *ps, struct stbl *st, /* update snd.mss with SMSS value */ cs->tcb.snd.mss = calc_smss(cs->tcb.snd.mss, &cs->tx.dst); + if (cs->tcb.so.ts.raw != 0) { + cs->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS; + } /* setup congestion variables */ cs->tcb.snd.cwnd = initial_cwnd(cs->tcb.snd.mss, ps->tcb.snd.cwnd); cs->tcb.snd.ssthresh = cs->tcb.snd.wnd; cs->tcb.snd.rto_tw = ps->tcb.snd.rto_tw; + cs->tcb.snd.rto_fw = ps->tcb.snd.rto_fw; cs->tcb.state = TCP_ST_ESTABLISHED; + TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB); /* add stream to the table */ - cs->ste = stbl_add_stream(st, pi, cs); + cs->ste = stbl_add_stream(st, &cs->s); if (cs->ste == NULL) return -ENOBUFS; @@ -937,7 +924,7 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st, *csp = NULL; - if (pi->tf.flags != TCP_FLAG_ACK || rx_check_stream(s, pi) != 0) + if ((pi->tf.flags & TCP_FLAG_ACK) == 0|| rx_check_stream(s, pi) != 0) return -EINVAL; ctx = s->s.ctx; @@ -964,7 +951,8 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st, /* cleanup on failure */ tcp_stream_down(cs); - stbl_del_stream(st, cs->ste, cs, 0); + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); + stbl_del_stream(st, cs->ste, &cs->s); cs->ste = NULL; } @@ -973,7 +961,7 @@ rx_ack_listen(struct tle_tcp_stream *s, struct stbl *st, } static inline int -data_pkt_adjust(const struct tcb *tcb, struct rte_mbuf *mb, uint32_t hlen, +data_pkt_adjust(const struct tcb *tcb, struct rte_mbuf **mb, uint32_t hlen, uint32_t *seqn, uint32_t *plen) { uint32_t len, n, seq; @@ -981,7 +969,7 @@ data_pkt_adjust(const struct tcb *tcb, struct rte_mbuf *mb, uint32_t hlen, seq = *seqn; len = *plen; - rte_pktmbuf_adj(mb, hlen); + rte_pktmbuf_adj(*mb, hlen); if (len == 0) return -ENODATA; /* cut off the start of the packet */ @@ -990,7 +978,7 @@ data_pkt_adjust(const struct tcb *tcb, struct rte_mbuf *mb, uint32_t hlen, if (n >= len) return -ENODATA; - rte_pktmbuf_adj(mb, n); + *mb = _rte_pktmbuf_adj(*mb, n); *seqn = seq + n; *plen = len - n; } @@ -1018,7 +1006,8 @@ rx_ackdata(struct tle_tcp_stream *s, uint32_t ack) tle_event_raise(s->tx.ev); else if (k == 0 && s->tx.cb.func != NULL) s->tx.cb.func(s->tx.cb.data, &s->s); - } + } else + txs_enqueue(s->s.ctx, s); } return n; @@ -1047,6 +1036,7 @@ rx_fin_state(struct tle_tcp_stream *s, struct resp_info *rsp) state = s->tcb.state; if (state == TCP_ST_ESTABLISHED) { + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); s->tcb.state = TCP_ST_CLOSE_WAIT; /* raise err.ev & err.cb */ if (s->err.ev != NULL) @@ -1055,6 +1045,11 @@ rx_fin_state(struct tle_tcp_stream *s, struct resp_info *rsp) s->err.cb.func(s->err.cb.data, &s->s); } else if (state == TCP_ST_FIN_WAIT_1 || state == TCP_ST_CLOSING) { rsp->flags |= TCP_FLAG_ACK; + + /* shutdown instead of close happens */ + if (s->err.ev != NULL) + tle_event_raise(s->err.ev); + if (ackfin != 0) stream_timewait(s, s->tcb.snd.rto_tw); else @@ -1089,15 +1084,17 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state, ts = rx_tms_opt(&s->tcb, mb); ret = rx_check_seqack(&s->tcb, seq, si->ack, plen, ts); - if (ret != 0) + if (ret != 0) { + rsp->flags |= TCP_FLAG_ACK; return ret; + } if (state < TCP_ST_ESTABLISHED) return -EINVAL; if (plen != 0) { - ret = data_pkt_adjust(&s->tcb, mb, hlen, &seq, &plen); + ret = data_pkt_adjust(&s->tcb, &mb, hlen, &seq, &plen); if (ret != 0) return ret; if (rx_data_enqueue(s, seq, plen, &mb, 1) != 1) @@ -1108,9 +1105,10 @@ rx_fin(struct tle_tcp_stream *s, uint32_t state, * fast-path: all data & FIN was already sent out * and now is acknowledged. 
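*/

data_pkt_adjust() above relies on serial-number arithmetic: RCV.NXT -
SEG.SEQ, computed in unsigned 32-bit, gives the length of the duplicate
prefix even across a sequence-space wrap. Self-contained illustration:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t rcv_nxt = 0x00000010;   /* next byte expected */
	uint32_t seg_seq = 0xfffffff0;   /* segment starts before wrap */

	/* 0x10 + 0x10 = 32 bytes of this segment were already received */
	assert(rcv_nxt - seg_seq == 32);
	return 0;
}

/*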
*/ - if (s->tcb.snd.fss == s->tcb.snd.nxt && - si->ack == (uint32_t)s->tcb.snd.nxt) { + if (s->tcb.snd.fss >= s->tcb.snd.nxt && + si->ack == (uint32_t)s->tcb.snd.fss) { s->tcb.snd.una = s->tcb.snd.fss; + s->tcb.snd.nxt = s->tcb.snd.una; empty_tq(s); /* conventional ACK processiing */ } else @@ -1148,8 +1146,25 @@ rx_rst(struct tle_tcp_stream *s, uint32_t state, uint32_t flags, else rc = check_seqn(&s->tcb, si->seq, 0); - if (rc == 0) + if (rc == 0) { + /* receive rst, connection is closed abnormal + * and should return errno in later operations. + */ + switch (state) { + case TCP_ST_SYN_SENT: + TCP_INC_STATS(TCP_MIB_ATTEMPTFAILS); + s->tcb.err = ECONNREFUSED; + break; + case TCP_ST_CLOSE_WAIT: + s->tcb.err = EPIPE; + break; + case TCP_ST_CLOSED: + return rc; + default: + s->tcb.err = ECONNRESET; + } stream_term(s); + } return rc; } @@ -1303,7 +1318,7 @@ rx_data_ack(struct tle_tcp_stream *s, struct dack_info *tack, if (ret == 0) { /* skip duplicate data, if any */ - ret = data_pkt_adjust(&s->tcb, mb[i], hlen, + ret = data_pkt_adjust(&s->tcb, &mb[i], hlen, &seq, &plen); } @@ -1336,7 +1351,6 @@ rx_data_ack(struct tle_tcp_stream *s, struct dack_info *tack, /* account for segment received */ ack_info_update(tack, &si[j], ret != 0, plen, ts); - rte_pktmbuf_adj(mb[j], hlen); } n = j - i; @@ -1501,6 +1515,7 @@ rx_ackfin(struct tle_tcp_stream *s) uint32_t state; s->tcb.snd.una = s->tcb.snd.fss; + s->tcb.snd.nxt = s->tcb.snd.una; empty_tq(s); state = s->tcb.state; @@ -1509,6 +1524,13 @@ rx_ackfin(struct tle_tcp_stream *s) else if (state == TCP_ST_FIN_WAIT_1) { timer_stop(s); s->tcb.state = TCP_ST_FIN_WAIT_2; + /* if stream is closed, should be released + * before timeout even without fin from peer + */ + if (s->tcb.uop & TCP_OP_CLOSE) { + s->tcb.snd.rto = s->tcb.snd.rto_fw; + timer_start(s); + } } else if (state == TCP_ST_CLOSING) { stream_timewait(s, s->tcb.snd.rto_tw); } @@ -1568,18 +1590,24 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state, s->tcb.snd.una = s->tcb.snd.nxt; s->tcb.snd.mss = calc_smss(so.mss, &s->tx.dst); + if (s->tcb.so.ts.raw != 0) { + s->tcb.snd.mss -= TCP_TX_OPT_LEN_TMS; + } s->tcb.snd.wnd = si->wnd << so.wscale; s->tcb.snd.wu.wl1 = si->seq; s->tcb.snd.wu.wl2 = si->ack; s->tcb.snd.wscale = so.wscale; + s->tcb.snd.cork_ts = 0; /* setup congestion variables */ s->tcb.snd.cwnd = initial_cwnd(s->tcb.snd.mss, s->tcb.snd.cwnd); + s->tcb.snd.ssthresh = s->tcb.snd.wnd; s->tcb.rcv.ts = so.ts.val; s->tcb.rcv.irs = si->seq; s->tcb.rcv.nxt = si->seq + 1; + s->tcb.rcv.cpy = si->seq + 1; /* if peer doesn't support WSCALE opt, recalculate RCV.WND */ s->tcb.rcv.wscale = (so.wscale == TCP_WSCALE_NONE) ? @@ -1594,6 +1622,7 @@ rx_synack(struct tle_tcp_stream *s, uint32_t ts, uint32_t state, timer_stop(s); s->tcb.state = TCP_ST_ESTABLISHED; rte_smp_wmb(); + TCP_INC_STATS_ATOMIC(TCP_MIB_CURRESTAB); if (s->tx.ev != NULL) tle_event_raise(s->tx.ev); @@ -1683,8 +1712,8 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts, * fast-path: all data & FIN was already sent out * and now is acknowledged. */ - if (s->tcb.snd.fss == s->tcb.snd.nxt && - tack.ack == (uint32_t)s->tcb.snd.nxt) + if (s->tcb.snd.fss >= s->tcb.snd.nxt && + tack.ack == (uint32_t)s->tcb.snd.fss) rx_ackfin(s); else rx_process_ack(s, ts, &tack); @@ -1696,10 +1725,24 @@ rx_stream(struct tle_tcp_stream *s, uint32_t ts, * - received segment with INO data and no TX is scheduled * for that stream. 
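*/

The per-state errno recorded in rx_rst() above reaches the application on
its next API call: tle_tcp_stream_recv() returns 0 and exports tcb.err
via rte_errno. A hypothetical caller sketch (handle_reset() is
application code, not TLDK API):

#include <errno.h>
#include <rte_errno.h>
#include <tle_tcp.h>

static void
poll_once(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t n)
{
	uint16_t k;

	k = tle_tcp_stream_recv(ts, pkt, n);
	if (k == 0) {
		if (rte_errno == ECONNRESET)
			handle_reset(ts);   /* peer sent RST */
		else if (rte_errno == EAGAIN)
			;                   /* no data yet, not an error */
	}
}

/*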
*/ - if (tack.segs.badseq != 0 || tack.segs.ofo != 0 || - (tack.segs.data != 0 && - rte_atomic32_read(&s->tx.arm) == 0)) + if (tack.segs.badseq != 0 || tack.segs.ofo != 0) rsp.flags |= TCP_FLAG_ACK; + else if (tack.segs.data != 0 && + rte_atomic32_read(&s->tx.arm) == 0 && + (s->s.option.tcpquickack || + s->tcb.rcv.nxt - s->tcb.snd.ack > 8 * s->tcb.so.mss)) { + rsp.flags |= TCP_FLAG_ACK; + if (s->s.option.tcpquickack > 0) + s->s.option.tcpquickack--; + } + else if (tack.segs.data && rsp.flags == 0) { + if (!s->tx.need_da) + s->tx.need_da = 1; + if (!s->tx.in_daq) { + s->tx.in_daq = 1; + rte_ring_enqueue_bulk(CTX_TCP_DAQ(s->s.ctx), (void**)&s, 1, NULL); + } + } rx_ofo_fin(s, &rsp); @@ -1775,7 +1818,6 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts, state = s->tcb.state; if (state == TCP_ST_LISTEN) { - /* one connection per flow */ cs = NULL; ret = -EINVAL; @@ -1832,6 +1874,72 @@ rx_postsyn(struct tle_dev *dev, struct stbl *st, uint32_t type, uint32_t ts, return num - k; } +static inline void +sync_refuse(struct tle_tcp_stream *s, struct tle_dev *dev, + const union pkt_info *pi, struct rte_mbuf *m) +{ + struct ether_hdr *eth_h; + struct ether_addr eth_addr; + struct ipv4_hdr *ip_h; + uint32_t ip_addr; + struct ipv6_hdr *ipv6_h; + struct in6_addr ipv6_addr; + struct tcp_hdr *th; + uint16_t port; + + /* rst pkt should not contain options for syn */ + rte_pktmbuf_trim(m, m->l4_len - sizeof(*th)); + + eth_h = rte_pktmbuf_mtod(m, struct ether_hdr*); + ether_addr_copy(ð_h->s_addr, ð_addr); + ether_addr_copy(ð_h->d_addr, ð_h->s_addr); + ether_addr_copy(ð_addr, ð_h->d_addr); + + th = rte_pktmbuf_mtod_offset(m, struct tcp_hdr*, + m->l2_len + m->l3_len); + port = th->src_port; + th->src_port = th->dst_port; + th->dst_port = port; + th->tcp_flags = TCP_FLAG_RST | TCP_FLAG_ACK; + th->recv_ack = rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) + 1); + th->sent_seq = 0; + th->data_off &= 0x0f; + th->data_off |= (sizeof(*th) / 4) << 4; + th->cksum = 0; + + if (pi->tf.type == TLE_V4) { + ip_h = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr*, + m->l2_len); + ip_addr = ip_h->src_addr; + ip_h->src_addr = ip_h->dst_addr; + ip_h->dst_addr = ip_addr; + ip_h->total_length = rte_cpu_to_be_16( + rte_be_to_cpu_16(ip_h->total_length) - + (m->l4_len - sizeof(*th))); + ip_h->hdr_checksum = 0; + th->cksum = rte_ipv4_udptcp_cksum(ip_h, th); + ip_h->hdr_checksum = rte_ipv4_cksum(ip_h); + } else { + ipv6_h = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr*, + m->l2_len); + rte_memcpy(&ipv6_addr, ipv6_h->src_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->src_addr, ipv6_h->dst_addr, + sizeof(struct in6_addr)); + rte_memcpy(ipv6_h->dst_addr, &ipv6_addr, + sizeof(struct in6_addr)); + ipv6_h->payload_len = rte_cpu_to_be_16( + rte_be_to_cpu_16(ipv6_h->payload_len) - + (m->l4_len - sizeof(*th))); + th->cksum = rte_ipv6_udptcp_cksum(ipv6_h, th); + } + + if (m->pkt_len < ETHER_MIN_LEN) + rte_pktmbuf_append(m, ETHER_MIN_LEN - m->pkt_len); + + if (send_pkt(s, dev, m) != 0) + rte_pktmbuf_free(m); +} static inline uint32_t rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts, @@ -1843,18 +1951,28 @@ rx_syn(struct tle_dev *dev, uint32_t type, uint32_t ts, uint32_t i, k; int32_t ret; - s = rx_obtain_listen_stream(dev, &pi[0], type); + s = rx_obtain_listen_stream(dev, &pi[0], type, 0); if (s == NULL) { - for (i = 0; i != num; i++) { - rc[i] = ENOENT; - rp[i] = mb[i]; + /* no socket listening this syn, send rst to refuse connect */ + s = TCP_STREAM(get_stream(dev->ctx)); + if (s != NULL) { + sync_refuse(s, 
dev, &pi[0], mb[0]); + put_stream(dev->ctx, &s->s, 0); + i = 1; + } else { + i = 0; } - return 0; + k = 0; + for (; i != num; i++) { + rc[k] = ENOENT; + rp[k] = mb[i]; + k++; + } + return num - k; } k = 0; for (i = 0; i != num; i++) { - /* check that this remote is allowed to connect */ if (rx_check_stream(s, &pi[i]) != 0) ret = -ENOENT; @@ -1879,51 +1997,34 @@ tle_tcp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], { struct stbl *st; struct tle_ctx *ctx; - uint32_t i, j, k, mt, n, t, ts; - uint64_t csf; + uint32_t i, j, k, n, t; + uint64_t ts; union pkt_info pi[num]; union seg_info si[num]; - union { - uint8_t t[TLE_VNUM]; - uint32_t raw; - } stu; + + TCP_ADD_STATS(TCP_MIB_INSEGS, num); ctx = dev->ctx; ts = tcp_get_tms(ctx->cycles_ms_shift); st = CTX_TCP_STLB(ctx); - mt = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0); - - stu.raw = 0; /* extract packet info and check the L3/L4 csums */ for (i = 0; i != num; i++) { get_pkt_info(pkt[i], &pi[i], &si[i]); - t = pi[i].tf.type; - csf = dev->rx.ol_flags[t] & - (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD); - - /* check csums in SW */ - if (pi[i].csf == 0 && csf != 0 && check_pkt_csum(pkt[i], csf, - pi[i].tf.type, IPPROTO_TCP) != 0) - pi[i].csf = csf; - - stu.t[t] = mt; + pi[i].csf = check_pkt_csum(pkt[i], t, IPPROTO_TCP); } - if (stu.t[TLE_V4] != 0) - stbl_lock(st, TLE_V4); - if (stu.t[TLE_V6] != 0) - stbl_lock(st, TLE_V6); - k = 0; for (i = 0; i != num; i += j) { - t = pi[i].tf.type; /*basic checks for incoming packet */ - if (t >= TLE_VNUM || pi[i].csf != 0 || dev->dp[t] == NULL) { + if (t >= TLE_VNUM || pi[i].csf != 0) { + TCP_INC_STATS(TCP_MIB_INERRS); + if (t < TLE_VNUM) + TCP_INC_STATS(TCP_MIB_CSUMERRORS); rc[k] = EINVAL; rp[k] = pkt[i]; j = 1; @@ -1942,11 +2043,6 @@ tle_tcp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], } } - if (stu.t[TLE_V4] != 0) - stbl_unlock(st, TLE_V4); - if (stu.t[TLE_V6] != 0) - stbl_unlock(st, TLE_V6); - return num - k; } @@ -1958,21 +2054,42 @@ tle_tcp_stream_accept(struct tle_stream *ts, struct tle_stream *rs[], struct tle_tcp_stream *s; s = TCP_STREAM(ts); - n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num); - if (n == 0) + + if (s == NULL) { + rte_errno = EINVAL; return 0; + } - /* - * if we still have packets to read, - * then rearm stream RX event. - */ - if (n == num && rte_ring_count(s->rx.q) != 0) { - if (tcp_stream_try_acquire(s) > 0 && s->rx.ev != NULL) - tle_event_raise(s->rx.ev); + if (tcp_stream_try_acquire(s) > 0) { + if (s->tcb.state != TCP_ST_LISTEN) { + tcp_stream_release(s); + rte_errno = EINVAL; + return 0; + } + + n = _rte_ring_dequeue_burst(s->rx.q, (void **)rs, num); + if (n == 0) + { + tcp_stream_release(s); + rte_errno = EAGAIN; + return 0; + } + + /* + * if we still have packets to read, + * then rearm stream RX event. + */ + if (n == num && rte_ring_count(s->rx.q) != 0) { + if (s->rx.ev != NULL) + tle_event_raise(s->rx.ev); + } + tcp_stream_release(s); + return n; + } else { tcp_stream_release(s); + rte_errno = EINVAL; + return 0; } - - return n; } uint16_t @@ -2000,6 +2117,7 @@ tle_tcp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num) stream_drb_free(s, drb + i, j - i); } + TCP_ADD_STATS(TCP_MIB_OUTSEGS, n); return n; } @@ -2029,7 +2147,7 @@ stream_fill_addr(struct tle_tcp_stream *s, const struct sockaddr *addr) /* setup L4 src ports and src address fields. 
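*/

sync_refuse() above answers a SYN that matches no listener with the RST
mandated by RFC 793 for a closed port: SEQ = 0, ACK = client ISN + 1,
flags RST|ACK. The header arithmetic in isolation (struct tcp_hdr comes
from <rte_tcp.h>; TCP_FLAG_* are the TLDK macros):

#include <rte_byteorder.h>
#include <rte_tcp.h>

static void
fill_refuse_rst(struct tcp_hdr *th, uint32_t client_isn)
{
	th->sent_seq = 0;
	th->recv_ack = rte_cpu_to_be_32(client_isn + 1);
	th->tcp_flags = TCP_FLAG_RST | TCP_FLAG_ACK;
}

/*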
*/ if (s->s.type == TLE_V4) { in4 = (const struct sockaddr_in *)addr; - if (in4->sin_addr.s_addr == INADDR_ANY || in4->sin_port == 0) + if (in4->sin_addr.s_addr == INADDR_ANY) return -EINVAL; s->s.port.src = in4->sin_port; @@ -2039,9 +2157,8 @@ stream_fill_addr(struct tle_tcp_stream *s, const struct sockaddr *addr) } else if (s->s.type == TLE_V6) { in6 = (const struct sockaddr_in6 *)addr; - if (memcmp(&in6->sin6_addr, &tle_ipv6_any, - sizeof(tle_ipv6_any)) == 0 || - in6->sin6_port == 0) + + if (IN6_IS_ADDR_UNSPECIFIED(&in6->sin6_addr)) return -EINVAL; s->s.port.src = in6->sin6_port; @@ -2063,8 +2180,7 @@ stream_fill_addr(struct tle_tcp_stream *s, const struct sockaddr *addr) if (s->s.type == TLE_V4) { if (s->s.ipv4.addr.dst == INADDR_ANY) s->s.ipv4.addr.dst = prm->local_addr4.s_addr; - } else if (memcmp(&s->s.ipv6.addr.dst, &tle_ipv6_any, - sizeof(tle_ipv6_any)) == 0) + } else if (IN6_IS_ADDR_UNSPECIFIED(&s->s.ipv6.addr.dst)) memcpy(&s->s.ipv6.addr.dst, &prm->local_addr6, sizeof(s->s.ipv6.addr.dst)); @@ -2075,7 +2191,8 @@ static inline int tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr) { int32_t rc; - uint32_t tms, seq; + uint32_t seq; + uint64_t tms; union pkt_info pi; struct stbl *st; struct stbl_entry *se; @@ -2112,7 +2229,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr) /* add the stream in stream table */ st = CTX_TCP_STLB(s->s.ctx); - se = stbl_add_stream_lock(st, s); + se = stbl_add_stream(st, &s->s); if (se == NULL) return -ENOBUFS; s->ste = se; @@ -2120,6 +2237,7 @@ tx_syn(struct tle_tcp_stream *s, const struct sockaddr *addr) /* put stream into the to-send queue */ txs_enqueue(s->s.ctx, s); + TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); return 0; } @@ -2165,13 +2283,31 @@ tle_tcp_stream_connect(struct tle_stream *ts, const struct sockaddr *addr) uint16_t tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) { - uint32_t n; + uint32_t n, i; struct tle_tcp_stream *s; + if (ts == NULL) { + rte_errno = ENOTCONN; + return 0; + } + s = TCP_STREAM(ts); + n = _rte_ring_mcs_dequeue_burst(s->rx.q, (void **)pkt, num); - if (n == 0) + if (n == 0) { + if (s->tcb.err != 0) { + rte_errno = s->tcb.err; + } else { + rte_errno = EAGAIN; + } return 0; + } + + for (i = 0; i < n; ++i) + s->tcb.rcv.cpy += rte_pktmbuf_pkt_len(pkt[i]); + + /* update receive window with left recv buffer*/ + s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale); /* * if we still have packets to read, @@ -2183,12 +2319,40 @@ tle_tcp_stream_recv(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) tcp_stream_release(s); } + /* if have some free space in rx queue, send ack to notify peer */ + if (rte_ring_free_count(s->rx.q) == WIN_NOTIFY_THRESH) { + txs_enqueue(s->s.ctx, s); + } + return n; } +uint16_t +tle_tcp_stream_inq(struct tle_stream *ts) +{ + struct tle_tcp_stream *s; + + s = TCP_STREAM(ts); + return s->tcb.rcv.nxt - s->tcb.rcv.cpy; +} + ssize_t tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, int iovcnt) +{ + if (ts == NULL) { + rte_errno = ENOTCONN; + return -1; + } + + return tle_tcp_stream_readv_msg(ts, iov, iovcnt, NULL); +} + +extern uint32_t timestamp_needed; + +ssize_t +tle_tcp_stream_readv_msg(struct tle_stream *ts, const struct iovec *iov, + int iovcnt, struct msghdr *msg) { int32_t i; uint32_t mn, n, tn; @@ -2196,13 +2360,47 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, struct tle_tcp_stream *s; struct iovec iv; struct rxq_objs mo[2]; + struct sockaddr_in* addr; + struct sockaddr_in6* addr6; s = 
TCP_STREAM(ts); /* get group of packets */ mn = tcp_rxq_get_objs(s, mo); - if (mn == 0) - return 0; + if (mn == 0) { + if (s->tcb.err != 0) + rte_errno = s->tcb.err; + else + rte_errno = EAGAIN; + return -1; + } + + if (!timestamp_needed) + ts->timestamp = mo[0].mb[0]->timestamp; + + if (msg != NULL && msg->msg_control != NULL) { + if (timestamp_needed) + set_msg_timestamp(msg, mo[0].mb[0]); + else + msg->msg_controllen = 0; + } + + if (msg != NULL && msg->msg_name != NULL) { + if (s->s.type == TLE_V4) { + addr = (struct sockaddr_in*)msg->msg_name; + addr->sin_family = AF_INET; + addr->sin_addr.s_addr = s->s.ipv4.addr.src; + addr->sin_port = s->s.port.src; + msg->msg_namelen = sizeof(struct sockaddr_in); + } else { + addr6 = (struct sockaddr_in6*)msg->msg_name; + addr6->sin6_family = AF_INET6; + rte_memcpy(&addr6->sin6_addr, &s->s.ipv6.addr.src, + sizeof(struct sockaddr_in6)); + addr6->sin6_port = s->s.port.src; + msg->msg_namelen = sizeof(struct sockaddr_in6); + } + } sz = 0; n = 0; @@ -2234,6 +2432,8 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, } tcp_rxq_consume(s, tn); + /* update receive window with left recv buffer*/ + s->tcb.rcv.wnd = calc_rx_wnd(s, s->tcb.rcv.wscale); /* * if we still have packets to read, @@ -2245,6 +2445,14 @@ tle_tcp_stream_readv(struct tle_stream *ts, const struct iovec *iov, tcp_stream_release(s); } + s->tcb.rcv.cpy += sz; + + /* if have some free space in rx queue, send ack to notify peer */ + n = rte_ring_free_count(s->rx.q); + if (n - tn < WIN_NOTIFY_THRESH && n >= WIN_NOTIFY_THRESH) { + txs_enqueue(s->s.ctx, s); + } + return sz; } @@ -2268,48 +2476,35 @@ tx_segments(struct tle_tcp_stream *s, uint64_t ol_flags, if (i == num) { /* queue packets for further transmission. */ rc = _rte_ring_enqueue_bulk(s->tx.q, (void **)segs, num); - if (rc != 0) + if (rc != 0) { + rc = -EAGAIN; free_mbufs(segs, num); + } } return rc; } -uint16_t -tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) +static inline uint16_t +stream_send(struct tle_tcp_stream *s, struct rte_mbuf *pkt[], + uint16_t num, uint16_t mss, uint64_t ol_flags) { - uint32_t i, j, k, mss, n, state; + uint16_t i, j, k; int32_t rc; - uint64_t ol_flags; - struct tle_tcp_stream *s; + uint32_t n, free_slots; struct rte_mbuf *segs[TCP_MAX_PKT_SEG]; - - s = TCP_STREAM(ts); - - /* mark stream as not closable. */ - if (tcp_stream_acquire(s) < 0) { - rte_errno = EAGAIN; - return 0; - } - - state = s->tcb.state; - if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) { - rte_errno = ENOTCONN; - tcp_stream_release(s); - return 0; - } - - mss = s->tcb.snd.mss; - ol_flags = s->tx.dst.ol_flags; + int32_t pkt_len; k = 0; rc = 0; + pkt_len = 0; while (k != num) { /* prepare and check for TX */ for (i = k; i != num; i++) { if (pkt[i]->pkt_len > mss || pkt[i]->nb_segs > TCP_MAX_PKT_SEG) break; + pkt_len += pkt[i]->pkt_len; rc = tcp_fill_mbuf(pkt[i], s, &s->tx.dst, ol_flags, s->s.port, 0, TCP_FLAG_ACK, 0, 0); if (rc != 0) @@ -2333,6 +2528,7 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) pkt[j]->l3_len + pkt[j]->l4_len); pkt[j]->ol_flags &= ol_flags; + pkt_len -= pkt[j]->pkt_len; } break; } @@ -2344,8 +2540,10 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) /* segment large packet and enqueue for sending */ } else if (i != num) { + free_slots = rte_ring_free_count(s->tx.q); + free_slots = RTE_MIN(free_slots, RTE_DIM(segs)); /* segment the packet. 
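*/

Both receive paths above now re-derive the advertised window from the
free space of the stream's rx ring, and schedule a pure window-update ACK
once a previously full ring crosses WIN_NOTIFY_THRESH again. A reduced
model of that calculation (the real calc_rx_wnd() internals are assumed,
not copied from the patch):

#include <stdint.h>
#include <rte_common.h>
#include <rte_ring.h>

static uint32_t
rx_wnd_sketch(const struct rte_ring *rxq, uint16_t mss, uint8_t wscale)
{
	uint32_t wnd = rte_ring_free_count(rxq) * mss;

	/* clamp so the value still fits the 16-bit field after scaling */
	return RTE_MIN(wnd, (uint32_t)UINT16_MAX << wscale);
}

/*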
*/ - rc = tcp_segmentation(pkt[i], segs, RTE_DIM(segs), + rc = tcp_segmentation(pkt[i], segs, free_slots, &s->tx.dst, mss); if (rc < 0) { rte_errno = -rc; @@ -2356,19 +2554,166 @@ tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) if (rc == 0) { /* free the large mbuf */ rte_pktmbuf_free(pkt[i]); + pkt_len += pkt[i]->pkt_len; /* set the mbuf as consumed */ k++; - } else + } else { /* no space left in tx queue */ + RTE_VERIFY(0); + break; + } + } + } + + s->tcb.snd.waitlen += pkt_len; + return k; +} + +static inline uint16_t +stream_send_tso(struct tle_tcp_stream *s, struct rte_mbuf *pkt[], + uint16_t num, uint16_t mss, uint64_t ol_flags) +{ + uint16_t i, k, nb_segs; + int32_t rc, pkt_len; + uint64_t ol_flags1; + struct rte_mbuf *pre_tail; + + k = 0; + rc = 0; + while (k != num) { + /* Make sure there is at least one slot available */ + if (rte_ring_free_count(s->tx.q) == 0) + break; + + /* prepare and check for TX */ + nb_segs = 0; + pkt_len = 0; + pre_tail = NULL; + for (i = k; i != num; i++) { + if (pkt[i]->nb_segs != 1) + rte_panic("chained mbuf: %p\n", pkt[i]); + /* We shall consider cwnd and snd wnd when limit len */ + if (nb_segs + pkt[i]->nb_segs <= TCP_MAX_PKT_SEG && + pkt_len + pkt[i]->pkt_len <= 65535 - RESERVE_HEADER_LEN) { + nb_segs += pkt[i]->nb_segs; + pkt_len += pkt[i]->pkt_len; + if (pre_tail) + pre_tail->next = pkt[i]; + pre_tail = rte_pktmbuf_lastseg(pkt[i]); + } else { + /* enqueue this one now */ + break; + } + } + + if (unlikely(i == k)) { + /* pkt[k] is a too big packet, now we fall back to + * non-tso send; we can optimize it later by + * splitting the mbuf. + */ + if (stream_send(s, &pkt[k], 1, mss, ol_flags) == 1) { + k++; + continue; + } else break; } + + pkt[k]->nb_segs = nb_segs; + pkt[k]->pkt_len = pkt_len; + + ol_flags1 = ol_flags; + if (pkt_len > mss) + ol_flags1 |= PKT_TX_TCP_SEG; + + rc = tcp_fill_mbuf(pkt[k], s, &s->tx.dst, ol_flags1, + s->s.port, 0, TCP_FLAG_ACK, 0, 0); + if (rc != 0) /* hard to recover */ + rte_panic("failed to fill mbuf: %p\n", pkt[k]); + + /* correct mss */ + pkt[k]->tso_segsz = mss; + + s->tcb.snd.waitlen += pkt_len; + /* We already make sure there is at least one slot */ + if (_rte_ring_enqueue_burst(s->tx.q, (void **)pkt + k, 1) < 1) + RTE_VERIFY(0); + + k = i; + } + + return k; +} + +uint16_t +tle_tcp_stream_send(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t num) +{ + uint16_t k, mss, state; + uint64_t ol_flags; + struct tle_tcp_stream *s; + + if (ts == NULL) { + rte_errno = EPIPE; + return 0; } + s = TCP_STREAM(ts); + + if (s->tcb.err != 0) { + rte_errno = s->tcb.err; + return 0; + } + + /* mark stream as not closable. */ + if (tcp_stream_acquire(s) < 0) { + rte_errno = EAGAIN; + return 0; + } + + state = s->tcb.state; + switch (state) { + case TCP_ST_ESTABLISHED: + case TCP_ST_CLOSE_WAIT: + break; + case TCP_ST_FIN_WAIT_1: + case TCP_ST_FIN_WAIT_2: + case TCP_ST_CLOSING: + case TCP_ST_LAST_ACK: + rte_errno = EPIPE; + tcp_stream_release(s); + return 0; + default: + rte_errno = ENOTCONN; + tcp_stream_release(s); + return 0; + } + + mss = s->tcb.snd.mss; + + ol_flags = s->tx.dst.ol_flags; + + /* Some reference number on the case: + * " - tap - - " + * ~2Gbps with tso disabled; + * ~16Gbps with tso enabled. 
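+ */

stream_send_tso() above hands the NIC one super-frame per enqueue and
lets the hardware cut it into MSS-sized segments. In plain DPDK terms the
per-mbuf metadata looks like this (values illustrative; PKT_TX_TCP_SEG
already implies the TCP checksum offload):

#include <stdint.h>
#include <rte_mbuf.h>

static void
mark_for_tso(struct rte_mbuf *m, uint16_t mss,
	uint8_t l2_len, uint8_t l3_len, uint8_t l4_len)
{
	m->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_SEG;
	m->tso_segsz = mss;     /* payload bytes per wire segment */
	m->l2_len = l2_len;     /* header lengths the driver needs */
	m->l3_len = l3_len;
	m->l4_len = l4_len;
}

+ /*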
+ */ + if (rte_ring_free_count(s->tx.q) == 0) { + /* Block send may try without waiting for tx event (raised by acked + * data), so here we will still put this stream for further process + */ + txs_enqueue(s->s.ctx, s); + rte_errno = EAGAIN; + k = 0; + } else if (s->tx.dst.dev->prm.tx_offload & DEV_TX_OFFLOAD_TCP_TSO) + k = stream_send_tso(s, pkt, num, mss, ol_flags); + else + k = stream_send(s, pkt, num, mss, ol_flags); + /* notify BE about more data to send */ if (k != 0) txs_enqueue(s->s.ctx, s); + /* if possible, re-arm stream write event. */ - if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL) + if (rte_ring_free_count(s->tx.q) && s->tx.ev != NULL && k == num) tle_event_raise(s->tx.ev); tcp_stream_release(s); @@ -2387,9 +2732,20 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, struct tle_tcp_stream *s; struct iovec iv; struct rte_mbuf *mb[2 * MAX_PKT_BURST]; + uint16_t mss; + + if (ts == NULL) { + rte_errno = EPIPE; + return -1; + } s = TCP_STREAM(ts); + if (s->tcb.err != 0) { + rte_errno = s->tcb.err; + return -1; + } + /* mark stream as not closable. */ if (tcp_stream_acquire(s) < 0) { rte_errno = EAGAIN; @@ -2397,7 +2753,18 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, } state = s->tcb.state; - if (state != TCP_ST_ESTABLISHED && state != TCP_ST_CLOSE_WAIT) { + switch (state) { + case TCP_ST_ESTABLISHED: + case TCP_ST_CLOSE_WAIT: + break; + case TCP_ST_FIN_WAIT_1: + case TCP_ST_FIN_WAIT_2: + case TCP_ST_CLOSING: + case TCP_ST_LAST_ACK: + rte_errno = EPIPE; + tcp_stream_release(s); + return -1; + default: rte_errno = ENOTCONN; tcp_stream_release(s); return -1; @@ -2408,11 +2775,24 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, for (i = 0; i != iovcnt; i++) tsz += iov[i].iov_len; + if (tsz == 0) { + tcp_stream_release(s); + return 0; + } + slen = rte_pktmbuf_data_room_size(mp); - slen = RTE_MIN(slen, s->tcb.snd.mss); + mss = s->tcb.snd.mss; + + slen = RTE_MIN(slen, mss); num = (tsz + slen - 1) / slen; n = rte_ring_free_count(s->tx.q); + + if (n == 0) { + tcp_stream_release(s); + return 0; + } + num = RTE_MIN(num, n); n = RTE_MIN(num, RTE_DIM(mb)); @@ -2456,7 +2836,6 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, k = 0; if (k != j) { - /* free pkts that were not enqueued */ free_mbufs(mb + k, j - k); @@ -2471,14 +2850,16 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, } } - if (k != 0) { - + if (k != 0) { /* notify BE about more data to send */ txs_enqueue(s->s.ctx, s); /* if possible, re-arm stream write event. 
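*/

The mbuf count in the writev path above is a ceiling division over the
per-mbuf payload size: num = (tsz + slen - 1) / slen. For example, with
slen = 1448 (MSS with timestamps) a 10000-byte iovec needs 7 mbufs:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t tsz = 10000, slen = 1448;

	assert((tsz + slen - 1) / slen == 7);  /* 6 full + a 1312 B tail */
	return 0;
}

/*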
*/ if (rte_ring_free_count(s->tx.q) != 0 && s->tx.ev != NULL) tle_event_raise(s->tx.ev); + } else { + rte_errno = EAGAIN; + sz = -1; } tcp_stream_release(s); @@ -2489,6 +2870,7 @@ tle_tcp_stream_writev(struct tle_stream *ts, struct rte_mempool *mp, static inline void tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state) { + uint64_t nxt = s->tcb.snd.nxt; /* try to send some data */ tx_nxt_data(s, tms); @@ -2499,6 +2881,9 @@ tx_data_fin(struct tle_tcp_stream *s, uint32_t tms, uint32_t state) s->tcb.snd.fss != s->tcb.snd.nxt) { s->tcb.snd.fss = ++s->tcb.snd.nxt; send_ack(s, tms, TCP_FLAG_FIN | TCP_FLAG_ACK); + } else if (s->tcb.snd.nxt == nxt) { + /* no pkt is sended, send an ack to notify window */ + send_ack(s, tms, TCP_FLAG_ACK); } } @@ -2549,7 +2934,6 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms) if (s->tcb.snd.nb_retx < s->tcb.snd.nb_retm) { if (state >= TCP_ST_ESTABLISHED && state <= TCP_ST_LAST_ACK) { - /* update SND.CWD and SND.SSTHRESH */ rto_cwnd_update(&s->tcb); @@ -2579,9 +2963,7 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms) s->tcb.snd.cwnd = s->tcb.snd.mss; send_ack(s, tms, TCP_FLAG_SYN); - - } else if (state == TCP_ST_TIME_WAIT) { - stream_term(s); + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); } /* RFC6298:5.5 back off the timer */ @@ -2590,24 +2972,65 @@ rto_stream(struct tle_tcp_stream *s, uint32_t tms) timer_restart(s); } else { - send_rst(s, s->tcb.snd.una); + if (state == TCP_ST_SYN_SENT) { + if (stream_fill_dest(s) != 0 || + is_broadcast_ether_addr((struct ether_addr *)s->tx.dst.hdr)) + s->tcb.err = EHOSTUNREACH; + else + /* TODO: do we send rst on this */ + s->tcb.err = ENOTCONN; + } else + send_rst(s, s->tcb.snd.una); stream_term(s); } } +#define DELAY_ACK_CHECK_INTERVAL 50 + int tle_tcp_process(struct tle_ctx *ctx, uint32_t num) { - uint32_t i, k, tms; + uint32_t i, k; + uint64_t tms; struct sdr *dr; struct tle_timer_wheel *tw; struct tle_stream *p; struct tle_tcp_stream *s, *rs[num]; + struct rte_ring* r; + + /* process streams with delayed ack */ + tms = tcp_get_tms(ctx->cycles_ms_shift); + if (tms - CTX_TCP_STREAMS(ctx)->da_ts > DELAY_ACK_CHECK_INTERVAL) { + r = CTX_TCP_DAQ(ctx); + if (rte_ring_count(r) > 0) { + while (true) { + k = rte_ring_dequeue_burst(r, (void **)rs, num, NULL); + for (i = 0; i < k; i++) { + s = rs[i]; + if (rte_atomic32_read(&s->tx.arm) == 0 && + s->tx.need_da) { + if(tcp_stream_try_acquire(s) > 0) { + s->tx.in_daq = 0; + s->s.option.tcpquickack = 8; + send_ack(s, tms, TCP_FLAG_ACK); + } + else + rte_ring_enqueue_burst(r, (void**)&s, 1, NULL); + tcp_stream_release(s); + } else { + s->tx.in_daq = 0; + } + } + if (k < num) + break; + } + } + CTX_TCP_STREAMS(ctx)->da_ts = tms; + } /* process streams with RTO exipred */ tw = CTX_TCP_TMWHL(ctx); - tms = tcp_get_tms(ctx->cycles_ms_shift); tle_timer_expire(tw, tms); k = tle_timer_get_expired_bulk(tw, (void **)rs, RTE_DIM(rs)); @@ -2616,9 +3039,24 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num) s = rs[i]; s->timer.handle = NULL; - if (tcp_stream_try_acquire(s) > 0) - rto_stream(s, tms); - tcp_stream_release(s); + while (s->tcb.state == TCP_ST_TIME_WAIT) { + if (tcp_stream_acquire(s) > 0) { + stream_term(s); + tcp_stream_release(s); + break; + } + } + + if (s->tcb.state == TCP_ST_FIN_WAIT_2) { + if (tcp_stream_try_acquire(s) > 0) + stream_term(s); + tcp_stream_release(s); + } + else if (s->tcb.state != TCP_ST_CLOSED) { + if (tcp_stream_try_acquire(s) > 0) + rto_stream(s, tms); + tcp_stream_release(s); + } } /* process streams from to-send queue */ @@ -2626,20 +3064,39 @@ 
tle_tcp_process(struct tle_ctx *ctx, uint32_t num) k = txs_dequeue_bulk(ctx, rs, RTE_DIM(rs)); for (i = 0; i != k; i++) { - s = rs[i]; - rte_atomic32_set(&s->tx.arm, 0); - if (tcp_stream_try_acquire(s) > 0) - tx_stream(s, tms); - else - txs_enqueue(s->s.ctx, s); + if (tcp_stream_try_acquire(s) > 0) { + if (rte_atomic32_read(&s->tx.arm) > 0) { + rte_atomic32_set(&s->tx.arm, 0); + + if (s->s.option.tcpcork) { + if (s->tcb.snd.cork_ts == 0) + s->tcb.snd.cork_ts = (uint32_t)tms; + if (s->tcb.state < TCP_ST_CLOSE_WAIT && + s->tcb.snd.waitlen < s->tcb.snd.mss && + (uint32_t)tms - s->tcb.snd.cork_ts < 200) { + txs_enqueue(s->s.ctx, s); + tcp_stream_release(s); + continue; + } else { + s->tcb.snd.cork_ts = 0; + } + } + + tx_stream(s, tms); + } + } else { + if (rte_atomic32_read(&s->tx.arm) > 0) + txs_enqueue(s->s.ctx, s); + } tcp_stream_release(s); } /* collect streams to close from the death row */ dr = CTX_TCP_SDR(ctx); + rte_spinlock_lock(&dr->lock); for (k = 0, p = STAILQ_FIRST(&dr->be); k != num && p != NULL; k++, p = STAILQ_NEXT(p, link)) @@ -2649,6 +3106,7 @@ tle_tcp_process(struct tle_ctx *ctx, uint32_t num) STAILQ_INIT(&dr->be); else STAILQ_FIRST(&dr->be) = p; + rte_spinlock_unlock(&dr->lock); /* cleanup closed streams */ for (i = 0; i != k; i++) { diff --git a/lib/libtle_l4p/tcp_rxtx.h b/lib/libtle_l4p/tcp_rxtx.h new file mode 100644 index 0000000..f5a0fac --- /dev/null +++ b/lib/libtle_l4p/tcp_rxtx.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2016-2017 Intel Corporation. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TCP_RXTX_H_ +#define _TCP_RXTX_H_ + +#include "tcp_stream.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint32_t +get_ip_pid(struct tle_dev *dev, uint32_t num, uint32_t type, uint32_t st) +{ + uint32_t pid; + rte_atomic32_t *pa; + + pa = &dev->tx.packet_id[type]; + + if (st == 0) { + pid = rte_atomic32_add_return(pa, num); + return pid - num; + } else { + pid = rte_atomic32_read(pa); + rte_atomic32_set(pa, pid + num); + return pid; + } +} + +static inline void +fill_tcph(struct tcp_hdr *l4h, const struct tcb *tcb, union l4_ports port, + uint32_t seq, uint8_t hlen, uint8_t flags) +{ + uint16_t wnd; + + l4h->src_port = port.dst; + l4h->dst_port = port.src; + + wnd = (flags & TCP_FLAG_SYN) ? + RTE_MIN(tcb->rcv.wnd, (uint32_t)UINT16_MAX) : + tcb->rcv.wnd >> tcb->rcv.wscale; + + /* ??? use sse shuffle to hton all remaining 16 bytes at once. ??? 
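*/

The TCP_CORK handling in tle_tcp_process() above re-queues a corked
stream instead of transmitting while less than one MSS is pending and
less than roughly 200 ms have passed since corking, mirroring the Linux
cork timeout. The hold condition in isolation (simplified sketch with
the patch's field names; the patch additionally requires the connection
state to be before CLOSE_WAIT):

#include "tcp_stream.h"

static int
cork_holds(const struct tcb *tcb, uint32_t now_ms)
{
	return tcb->snd.waitlen < tcb->snd.mss &&
		now_ms - tcb->snd.cork_ts < 200;
}

/*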
*/ + l4h->sent_seq = rte_cpu_to_be_32(seq); + l4h->recv_ack = rte_cpu_to_be_32(tcb->rcv.nxt); + l4h->data_off = hlen / TCP_DATA_ALIGN << TCP_DATA_OFFSET; + l4h->tcp_flags = flags; + l4h->rx_win = rte_cpu_to_be_16(wnd); + l4h->cksum = 0; + l4h->tcp_urp = 0; + + if (flags & TCP_FLAG_SYN) + fill_syn_opts(l4h + 1, &tcb->so); + else if ((flags & TCP_FLAG_RST) == 0 && tcb->so.ts.raw != 0) + fill_tms_opts(l4h + 1, tcb->snd.ts, tcb->rcv.ts); +} + +static inline int +tcp_fill_mbuf(struct rte_mbuf *m, const struct tle_tcp_stream *s, + const struct tle_dest *dst, uint64_t ol_flags, + union l4_ports port, uint32_t seq, uint32_t flags, + uint32_t pid, uint32_t swcsm) +{ + uint32_t l4, len, plen; + struct tcp_hdr *l4h; + char *l2h, *l3; + + len = dst->l2_len + dst->l3_len; + plen = m->pkt_len; + + if (flags & TCP_FLAG_SYN) { + /* basic length */ + l4 = sizeof(*l4h) + TCP_OPT_LEN_MSS; + + /* add wscale space and nop */ + if (s->tcb.so.wscale) { + l4 += TCP_OPT_LEN_WSC + TCP_OPT_LEN_NOP; + } + + /* add timestamp space and nop */ + if (s->tcb.so.ts.raw) { + l4 += TCP_TX_OPT_LEN_TMS; + } + } else if ((flags & TCP_FLAG_RST) == 0 && s->tcb.rcv.ts != 0) { + l4 = sizeof(*l4h) + TCP_TX_OPT_LEN_TMS; + } else { + l4 = sizeof(*l4h); + } + + /* adjust mbuf to put L2/L3/L4 headers into it. */ + l2h = rte_pktmbuf_prepend(m, len + l4); + if (l2h == NULL) + return -EINVAL; + + /* copy L2/L3 header */ + rte_memcpy(l2h, dst->hdr, len); + + /* setup TCP header & options */ + l4h = (struct tcp_hdr *)(l2h + len); + fill_tcph(l4h, &s->tcb, port, seq, l4, flags); + + /* setup mbuf TX offload related fields. */ + m->tx_offload = _mbuf_tx_offload(dst->l2_len, dst->l3_len, l4, 0, 0, 0); + m->ol_flags |= ol_flags; + + /* update proto specific fields. */ + + l3 = l2h + dst->l2_len; + if (((struct ipv4_hdr*)l3)->version_ihl>>4 == 4) { + struct ipv4_hdr *l3h; + l3h = (struct ipv4_hdr *)l3; + l3h->packet_id = rte_cpu_to_be_16(pid); + l3h->total_length = rte_cpu_to_be_16(plen + dst->l3_len + l4); + + if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) + l4h->cksum = _ipv4x_phdr_cksum(l3h, m->l3_len, + ol_flags); + else if (swcsm != 0) + l4h->cksum = _ipv4_udptcp_mbuf_cksum(m, len, l3h); + + if ((ol_flags & PKT_TX_IP_CKSUM) == 0 && swcsm != 0) + l3h->hdr_checksum = _ipv4x_cksum(l3h, m->l3_len); + } else { + struct ipv6_hdr *l3h; + l3h = (struct ipv6_hdr *)l3; + l3h->payload_len = rte_cpu_to_be_16(plen + l4); + if ((ol_flags & PKT_TX_TCP_CKSUM) != 0) + l4h->cksum = rte_ipv6_phdr_cksum(l3h, ol_flags); + else if (swcsm != 0) + l4h->cksum = _ipv6_udptcp_mbuf_cksum(m, len, l3h); + } + + return 0; +} + +static inline void +stream_drb_free(struct tle_tcp_stream *s, struct tle_drb *drbs[], + uint32_t nb_drb) +{ + _rte_ring_enqueue_burst(s->tx.drb.r, (void **)drbs, nb_drb); +} + +static inline uint32_t +stream_drb_alloc(struct tle_tcp_stream *s, struct tle_drb *drbs[], + uint32_t nb_drb) +{ + return _rte_ring_dequeue_burst(s->tx.drb.r, (void **)drbs, nb_drb); +} + +/* + * queue standalone packet to he particular output device + * It assumes that: + * - L2/L3/L4 headers should be already set. + * - packet fits into one segment. + */ +static inline int +send_pkt(struct tle_tcp_stream *s, struct tle_dev *dev, struct rte_mbuf *m) +{ + uint32_t n, nb; + struct tle_drb *drb; + + if (stream_drb_alloc(s, &drb, 1) == 0) + return -ENOBUFS; + + /* enqueue pkt for TX. */ + nb = 1; + n = tle_dring_mp_enqueue(&dev->tx.dr, (const void * const*)&m, 1, + &drb, &nb); + + /* free unused drbs. */ + if (nb != 0) + stream_drb_free(s, &drb, 1); + + return (n == 1) ? 
0 : -ENOBUFS; +} + +#define TCP_OLFLAGS_CKSUM(flags) (flags & (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM)) + +static inline int +send_ctrl_pkt(struct tle_tcp_stream *s, struct rte_mbuf *m, uint32_t seq, + uint32_t flags) +{ + const struct tle_dest *dst; + uint32_t pid, type; + int32_t rc; + + dst = &s->tx.dst; + type = s->s.type; + pid = get_ip_pid(dst->dev, 1, type, (s->flags & TLE_CTX_FLAG_ST) != 0); + + rc = tcp_fill_mbuf(m, s, dst, TCP_OLFLAGS_CKSUM(dst->ol_flags), + s->s.port, seq, flags, pid, 1); + if (rc == 0) + rc = send_pkt(s, dst->dev, m); + + return rc; +} + +static inline int +send_rst(struct tle_tcp_stream *s, uint32_t seq) +{ + struct rte_mbuf *m; + int32_t rc; + + m = rte_pktmbuf_alloc(s->tx.dst.head_mp); + if (m == NULL) + return -ENOMEM; + + rc = send_ctrl_pkt(s, m, seq, TCP_FLAG_RST); + if (rc != 0) + rte_pktmbuf_free(m); + else + TCP_INC_STATS(TCP_MIB_OUTRSTS); + + return rc; +} + + + +#ifdef __cplusplus +} +#endif + +#endif /* _TCP_RXTX_H_ */ diff --git a/lib/libtle_l4p/tcp_stream.c b/lib/libtle_l4p/tcp_stream.c index 4e9ddb7..e534c8f 100644 --- a/lib/libtle_l4p/tcp_stream.c +++ b/lib/libtle_l4p/tcp_stream.c @@ -20,6 +20,8 @@ #include #include +#include + #include "tcp_stream.h" #include "tcp_timer.h" #include "stream_table.h" @@ -27,6 +29,7 @@ #include "tcp_ctl.h" #include "tcp_ofo.h" #include "tcp_txq.h" +#include "tcp_rxtx.h" static void unuse_stream(struct tle_tcp_stream *s) @@ -38,29 +41,32 @@ unuse_stream(struct tle_tcp_stream *s) static void fini_stream(struct tle_tcp_stream *s) { - if (s != NULL) { - rte_free(s->rx.q); - tcp_ofo_free(s->rx.ofo); - rte_free(s->tx.q); - rte_free(s->tx.drb.r); - } + rte_free(s); } static void tcp_fini_streams(struct tle_ctx *ctx) { - uint32_t i; struct tcp_streams *ts; + struct tle_stream *s; ts = CTX_TCP_STREAMS(ctx); if (ts != NULL) { stbl_fini(&ts->st); - for (i = 0; i != ctx->prm.max_streams; i++) - fini_stream(&ts->s[i]); + + /* TODO: free those in use? may be not necessary, as we assume + * all streams have been closed and are free. + */ + while (ctx->streams.nb_free--) { + s = STAILQ_FIRST(&ctx->streams.free); + STAILQ_FIRST(&ctx->streams.free) = STAILQ_NEXT(s, link); + fini_stream(TCP_STREAM(s)); + } /* free the timer wheel */ tle_timer_free(ts->tmr); rte_free(ts->tsq); + rte_free(ts->daq); STAILQ_INIT(&ts->dr.fe); STAILQ_INIT(&ts->dr.be); @@ -94,61 +100,100 @@ alloc_ring(uint32_t n, uint32_t flags, int32_t socket) return r; } +/* stream memory layout: + * [tle_tcp_stream] [rx.q] [rx.ofo] [tx.q] [tx.drb.r] + */ static int -init_stream(struct tle_ctx *ctx, struct tle_tcp_stream *s) +add_stream(struct tle_ctx *ctx) { - size_t bsz, rsz, sz; - uint32_t f, i, k, n, nb; + size_t sz_s, sz_rxq, sz_ofo, sz_txq, sz_drb_r, sz; + /* for rx.q */ + uint32_t n_rxq; + /* for rx.ofo */ + struct ofo *ofo; + struct rte_mbuf **obj; + uint32_t ndb, nobj; + size_t dsz, osz; + /* for tx.q */ + uint32_t n_txq; + /* for tx.drb.r */ + size_t bsz, rsz; struct tle_drb *drb; - char name[RTE_RING_NAMESIZE]; - - f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 : - (RING_F_SP_ENQ | RING_F_SC_DEQ); - - /* init RX part. */ - - n = RTE_MAX(ctx->prm.max_stream_rbufs, 1U); - s->rx.q = alloc_ring(n, f | RING_F_SP_ENQ, ctx->prm.socket_id); - if (s->rx.q == NULL) - return -ENOMEM; - - s->rx.ofo = tcp_ofo_alloc(n, ctx->prm.socket_id); - if (s->rx.ofo == NULL) - return -ENOMEM; + uint32_t k, nb, n_drb; - /* init TX part. 
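*/

add_stream() collapses the per-member allocations removed here into one
cache-line-padded block holding the stream and all of its rings. A
reduced model of the size computation (part sizes passed in for brevity):

#include <stddef.h>
#include <rte_common.h>

static size_t
stream_block_size(size_t sz_s, size_t sz_rxq, size_t sz_ofo,
	size_t sz_txq, size_t sz_drb)
{
	/* each part starts on its own cache line, as in the patch */
	return RTE_ALIGN_CEIL(sz_s, RTE_CACHE_LINE_SIZE) +
		RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE) +
		RTE_ALIGN_CEIL(sz_ofo, RTE_CACHE_LINE_SIZE) +
		RTE_ALIGN_CEIL(sz_txq, RTE_CACHE_LINE_SIZE) +
		RTE_ALIGN_CEIL(sz_drb, RTE_CACHE_LINE_SIZE);
}

/*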
*/ - - n = RTE_MAX(ctx->prm.max_stream_sbufs, 1U); - s->tx.q = alloc_ring(n, f | RING_F_SC_DEQ, ctx->prm.socket_id); - if (s->tx.q == NULL) - return -ENOMEM; + uint32_t f, i; + char name[RTE_RING_NAMESIZE]; + struct tle_tcp_stream *s; + // stream + sz_s = RTE_ALIGN_CEIL(sizeof(*s), RTE_CACHE_LINE_SIZE); + + // rx.q + n_rxq = RTE_MAX(ctx->prm.max_stream_rbufs, 1U); + n_rxq = rte_align32pow2(n_rxq); + sz_rxq = rte_ring_get_memsize(n_rxq); + sz_rxq = RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE); + + // rx.ofo + calc_ofo_elems(n_rxq, &nobj, &ndb); + osz = sizeof(*ofo) + sizeof(ofo->db[0]) * ndb; + dsz = sizeof(ofo->db[0].obj[0]) * nobj * ndb; + sz_ofo = osz + dsz; + sz_ofo = RTE_ALIGN_CEIL(sz_ofo, RTE_CACHE_LINE_SIZE); + + // tx.q + n_txq = RTE_MAX(ctx->prm.max_stream_sbufs, 1U); + n_txq = rte_align32pow2(n_txq); + sz_txq = rte_ring_get_memsize(n_txq); + sz_txq = RTE_ALIGN_CEIL(sz_txq, RTE_CACHE_LINE_SIZE); + + // tx.drb.r nb = drb_nb_elem(ctx); k = calc_stream_drb_num(ctx, nb); - n = rte_align32pow2(k); - - /* size of the drbs ring */ - rsz = rte_ring_get_memsize(n); + n_drb = rte_align32pow2(k); + rsz = rte_ring_get_memsize(n_drb); /* size of the drbs ring */ rsz = RTE_ALIGN_CEIL(rsz, RTE_CACHE_LINE_SIZE); + bsz = tle_drb_calc_size(nb); /* size of the drb. */ + sz_drb_r = rsz + bsz * k; /* total stream drbs size. */ + sz_drb_r = RTE_ALIGN_CEIL(sz_drb_r, RTE_CACHE_LINE_SIZE); - /* size of the drb. */ - bsz = tle_drb_calc_size(nb); - - /* total stream drbs size. */ - sz = rsz + bsz * k; - - s->tx.drb.r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, - ctx->prm.socket_id); - if (s->tx.drb.r == NULL) { - TCP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d " + sz = sz_s + sz_rxq + sz_ofo + sz_txq + sz_drb_r; + s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, + ctx->prm.socket_id); + if (s == NULL) { + TCP_LOG(ERR, "%s: allocation of %zu bytes on socket %d " "failed with error code: %d\n", - __func__, s, sz, ctx->prm.socket_id, rte_errno); + __func__, sz, ctx->prm.socket_id, rte_errno); return -ENOMEM; } - snprintf(name, sizeof(name), "%p@%zu", s, sz); - rte_ring_init(s->tx.drb.r, name, n, f); + s->rx.q = (struct rte_ring *)((uintptr_t)s + sz_s); + s->rx.ofo = (struct ofo *)((uintptr_t)s->rx.q + sz_rxq); + ofo = s->rx.ofo; + s->tx.q = (struct rte_ring *)((uintptr_t)s->rx.ofo + sz_ofo); + s->tx.drb.r = (struct rte_ring *)((uintptr_t)s->tx.q + sz_txq); + + // ring flags + f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 : + (RING_F_SP_ENQ | RING_F_SC_DEQ); + + /* init RX part. */ + snprintf(name, sizeof(name), "%p@%zu", s->rx.q, sz_rxq); + rte_ring_init(s->rx.q, name, n_rxq, f); + obj = (struct rte_mbuf **)&ofo->db[ndb]; + for (i = 0; i != ndb; i++) { + ofo->db[i].nb_max = nobj; + ofo->db[i].obj = obj + i * nobj; + } + ofo->nb_max = ndb; + + /* init TX part. */ + snprintf(name, sizeof(name), "%p@%zu", s->tx.q, sz_txq); + rte_ring_init(s->tx.q, name, n_txq, f); + + snprintf(name, sizeof(name), "%p@%zu", s->tx.drb.r, sz_drb_r); + rte_ring_init(s->tx.drb.r, name, n_drb, f); for (i = 0; i != k; i++) { drb = (struct tle_drb *)((uintptr_t)s->tx.drb.r + rsz + bsz * i); @@ -200,7 +245,7 @@ tcp_init_streams(struct tle_ctx *ctx) f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 
0 : (RING_F_SP_ENQ | RING_F_SC_DEQ); - sz = sizeof(*ts) + sizeof(ts->s[0]) * ctx->prm.max_streams; + sz = sizeof(*ts); ts = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, ctx->prm.socket_id); if (ts == NULL) { @@ -210,6 +255,7 @@ tcp_init_streams(struct tle_ctx *ctx) return -ENOMEM; } + rte_spinlock_init(&ts->dr.lock); STAILQ_INIT(&ts->dr.fe); STAILQ_INIT(&ts->dr.be); @@ -225,15 +271,16 @@ tcp_init_streams(struct tle_ctx *ctx) } else { ts->tsq = alloc_ring(ctx->prm.max_streams, f | RING_F_SC_DEQ, ctx->prm.socket_id); - if (ts->tsq == NULL) + ts->daq = alloc_ring(ctx->prm.max_streams, + RING_F_SP_ENQ | RING_F_SC_DEQ, ctx->prm.socket_id); + if (ts->tsq == NULL || ts->daq == NULL) rc = -ENOMEM; else - rc = stbl_init(&ts->st, ctx->prm.max_streams, - ctx->prm.socket_id); + rc = stbl_init(&ts->st, (ctx->prm.flags & TLE_CTX_FLAG_ST) == 0); } - for (i = 0; rc == 0 && i != ctx->prm.max_streams; i++) - rc = init_stream(ctx, &ts->s[i]); + for (i = 0; rc == 0 && i != ctx->prm.min_streams; i++) + rc = add_stream(ctx); if (rc != 0) { TCP_LOG(ERR, "initalisation of %u-th stream failed", i); @@ -243,11 +290,30 @@ tcp_init_streams(struct tle_ctx *ctx) return rc; } -static void __attribute__((constructor)) +/* + * Note this function is not thread-safe, and we did not lock here as we + * have the assumption that this ctx is dedicated to one thread. + */ +static uint32_t +tcp_more_streams(struct tle_ctx *ctx) +{ + uint32_t i, nb; + uint32_t nb_max = ctx->prm.max_streams - 1; + uint32_t nb_cur = ctx->streams.nb_cur; + + nb = RTE_MIN(ctx->prm.delta_streams, nb_max - nb_cur); + for (i = 0; i < nb; i++) + if (add_stream(ctx) != 0) + break; + return i; +} + +static void __attribute__((constructor(101))) tcp_stream_setup(void) { static const struct stream_ops tcp_ops = { .init_streams = tcp_init_streams, + .more_streams = tcp_more_streams, .fini_streams = tcp_fini_streams, .free_drbs = tcp_free_drbs, }; @@ -305,16 +371,12 @@ tle_tcp_stream_open(struct tle_ctx *ctx, s = (struct tle_tcp_stream *)get_stream(ctx); if (s == NULL) { - rte_errno = ENFILE; - return NULL; - - /* some TX still pending for that stream. */ - } else if (TCP_STREAM_TX_PENDING(s)) { - put_stream(ctx, &s->s, 0); rte_errno = EAGAIN; return NULL; } + s->s.option.raw = prm->option; + /* setup L4 ports and L3 addresses fields. */ rc = stream_fill_ctx(ctx, &s->s, (const struct sockaddr *)&prm->addr.local, @@ -336,12 +398,14 @@ tle_tcp_stream_open(struct tle_ctx *ctx, /* store other params */ s->flags = ctx->prm.flags; + s->tcb.err = 0; s->tcb.snd.nb_retm = (prm->cfg.nb_retries != 0) ? prm->cfg.nb_retries : TLE_TCP_DEFAULT_RETRIES; s->tcb.snd.cwnd = (ctx->prm.icw == 0) ? TCP_INITIAL_CWND_MAX : ctx->prm.icw; s->tcb.snd.rto_tw = (ctx->prm.timewait == TLE_TCP_TIMEWAIT_DEFAULT) ? TCP_RTO_2MSL : ctx->prm.timewait; + s->tcb.snd.rto_fw = TLE_TCP_FINWAIT_TIMEOUT; tcp_stream_up(s); return &s->s; @@ -351,47 +415,74 @@ tle_tcp_stream_open(struct tle_ctx *ctx, * Helper functions, used by close API. */ static inline int -stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s) +stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s, int close) { uint16_t uop; uint32_t state; static const struct tle_stream_cb zcb; + /* mark stream as unavaialbe for RX/TX. 
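*/

With streams now grown on demand (min_streams at init, delta_streams per
tcp_more_streams() call, capped by max_streams), tle_tcp_stream_open()
reports pool exhaustion as the transient EAGAIN instead of ENFILE. A
hypothetical caller sketch that treats it as retryable:

#include <rte_errno.h>
#include <tle_tcp.h>

static struct tle_stream *
open_with_retry(struct tle_ctx *ctx, const struct tle_tcp_stream_param *p)
{
	struct tle_stream *ts;
	int retries = 3;

	do {
		ts = tle_tcp_stream_open(ctx, p);
	} while (ts == NULL && rte_errno == EAGAIN && retries-- != 0);

	return ts;
}

/*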
*/ + tcp_stream_down(s); + /* check was close() already invoked */ uop = s->tcb.uop; - if ((uop & TCP_OP_CLOSE) != 0) + if ((uop & TCP_OP_CLOSE) != 0) { + tcp_stream_up(s); return -EDEADLK; + } /* record that close() was already invoked */ - if (rte_atomic16_cmpset(&s->tcb.uop, uop, uop | TCP_OP_CLOSE) == 0) + if (close && + rte_atomic16_cmpset(&s->tcb.uop, uop, uop | TCP_OP_CLOSE) == 0) { + tcp_stream_up(s); return -EDEADLK; - - /* mark stream as unavaialbe for RX/TX. */ - tcp_stream_down(s); + } /* reset events/callbacks */ - s->rx.ev = NULL; s->tx.ev = NULL; - s->err.ev = NULL; - - s->rx.cb = zcb; s->tx.cb = zcb; - s->err.cb = zcb; + if (close) { + s->rx.ev = NULL; + s->rx.cb = zcb; + s->err.ev = NULL; + s->err.cb = zcb; + } state = s->tcb.state; /* CLOSED, LISTEN, SYN_SENT - we can close the stream straighway */ - if (state <= TCP_ST_SYN_SENT) { + if (close && state <= TCP_ST_SYN_SENT) { tcp_stream_reset(ctx, s); return 0; } + /* if stream is closed, should be released + * before timeout even without fin from peer + */ + if (close && state == TCP_ST_FIN_WAIT_2) { + s->tcb.snd.rto = s->tcb.snd.rto_fw; + timer_start(s); + } + + /* closing tcp socket when receive buffer is not empty is an exception. + * send rst and close the socket immediately. + */ + if (close && rte_ring_count(s->rx.q) > 0) { + TCP_INC_STATS(TCP_MIB_ESTABRESETS); + send_rst(s, s->tcb.snd.nxt); + stream_term(s); + return 0; + } + /* generate FIN and proceed with normal connection termination */ if (state == TCP_ST_ESTABLISHED || state == TCP_ST_CLOSE_WAIT) { - /* change state */ - s->tcb.state = (state == TCP_ST_ESTABLISHED) ? - TCP_ST_FIN_WAIT_1 : TCP_ST_LAST_ACK; + if (state == TCP_ST_ESTABLISHED) { + TCP_DEC_STATS_ATOMIC(TCP_MIB_CURRESTAB); + s->tcb.state = TCP_ST_FIN_WAIT_1; + } else + s->tcb.state = TCP_ST_LAST_ACK; + /* mark stream as writable/readable again */ tcp_stream_up(s); @@ -401,12 +492,8 @@ stream_close(struct tle_ctx *ctx, struct tle_tcp_stream *s) return 0; } - /* - * accroding to the state, close() was already invoked, - * should never that point. - */ - RTE_ASSERT(0); - return -EINVAL; + tcp_stream_up(s); + return 0; } uint32_t @@ -428,7 +515,7 @@ tle_tcp_stream_close_bulk(struct tle_stream *ts[], uint32_t num) } ctx = s->s.ctx; - rc = stream_close(ctx, s); + rc = stream_close(ctx, s, 1); if (rc != 0) break; } @@ -449,7 +536,36 @@ tle_tcp_stream_close(struct tle_stream *ts) return -EINVAL; ctx = s->s.ctx; - return stream_close(ctx, s); + return stream_close(ctx, s, 1); +} + +int +tle_tcp_stream_shutdown(struct tle_stream *ts, int how) +{ + struct tle_tcp_stream *s; + + s = TCP_STREAM(ts); + if (ts == NULL || s->s.type >= TLE_VNUM) + return -EINVAL; + + switch (how) { + case SHUT_WR: + /* Notify other threads which may wait on the event */ + if (s->tx.ev) + tle_event_raise(s->tx.ev); + return stream_close(s->s.ctx, s, 0); + case SHUT_RDWR: + if (s->err.ev) + tle_event_raise(s->err.ev); + return stream_close(s->s.ctx, s, 1); + case SHUT_RD: + /* todo: not implemented yet*/ + return 0; + default: + errno = EINVAL; + } + + return -1; } int @@ -506,6 +622,12 @@ tle_tcp_stream_listen(struct tle_stream *ts) if (ts == NULL || s->s.type >= TLE_VNUM) return -EINVAL; + /* app may listen for multiple times to change backlog, + * we will just return success for such cases. + */ + if (s->tcb.state == TCP_ST_LISTEN) + return 0; + /* mark stream as not closable. 
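*/

tle_tcp_stream_shutdown() above gives sockets-like half-close semantics:
SHUT_WR sends FIN but keeps the receive side readable, SHUT_RDWR behaves
like close(), and SHUT_RD is accepted as a no-op for now. Hypothetical
usage (consume() is application code):

#include <sys/socket.h>
#include <tle_tcp.h>

static void
half_close(struct tle_stream *ts, struct rte_mbuf *pkt[], uint16_t n)
{
	uint16_t k;

	if (tle_tcp_stream_shutdown(ts, SHUT_WR) != 0)
		return;

	/* drain peer data until its FIN arrives */
	while ((k = tle_tcp_stream_recv(ts, pkt, n)) > 0)
		consume(pkt, k);
}

/*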
*/ if (tcp_stream_try_acquire(s) > 0) { rc = rte_atomic16_cmpset(&s->tcb.state, TCP_ST_CLOSED, @@ -611,3 +733,26 @@ tle_tcp_stream_get_mss(const struct tle_stream * ts) s = TCP_STREAM(ts); return s->tcb.snd.mss; } + +int +tle_tcp_stream_get_info(const struct tle_stream * ts, void *info, socklen_t *optlen) +{ + struct tle_tcp_stream *s; + struct tcp_info i; + + if (ts == NULL) + return -EINVAL; + + s = TCP_STREAM(ts); + if (s->tcb.state < TCP_ST_ESTABLISHED) + return -ENOTCONN; + + memset(&i, 0, sizeof(struct tcp_info)); + /* fix me, total retrans? */ + i.tcpi_total_retrans = s->tcb.snd.nb_retx; + + if (*optlen > sizeof(struct tcp_info)) + *optlen = sizeof(struct tcp_info); + rte_memcpy(info, &i, *optlen); + return 0; +} diff --git a/lib/libtle_l4p/tcp_stream.h b/lib/libtle_l4p/tcp_stream.h index 4629fe6..59cf803 100644 --- a/lib/libtle_l4p/tcp_stream.h +++ b/lib/libtle_l4p/tcp_stream.h @@ -52,10 +52,12 @@ enum { }; struct tcb { + int err; volatile uint16_t state; volatile uint16_t uop; /* operations by user performed */ struct { uint32_t nxt; + uint32_t cpy; /* head of yet unread data */ uint32_t irs; /* initial received sequence */ uint32_t wnd; uint32_t ts; @@ -83,7 +85,10 @@ struct tcb { uint32_t ssthresh; /* slow start threshold */ uint32_t rto; /* retransmission timeout */ uint32_t rto_tw; /* TIME_WAIT retransmission timeout */ + uint32_t rto_fw; /* FIN_WAIT_2 waiting timeout */ uint32_t iss; /* initial send sequence */ + uint32_t waitlen; /* total length of unacknowledged pkt */ + uint32_t cork_ts; uint16_t mss; uint8_t wscale; uint8_t nb_retx; /* number of retransmission */ @@ -119,7 +124,10 @@ struct tle_tcp_stream { } rx __rte_cache_aligned; struct { - rte_atomic32_t arm; /* when > 0 stream is in to-send queue */ + rte_atomic32_t arm; /* when > 0 stream need to send pkt */ + rte_atomic32_t in_tsq; /* when > 0 stream is in to-send queue */ + uint8_t in_daq; /* when > 0 stream is in delay-ack queue */ + uint8_t need_da; /* when > 0 stream need to send delay-ack */ struct { uint32_t nb_elem; /* number of objects per drb. */ uint32_t nb_max; /* number of drbs per stream. */ @@ -154,14 +162,16 @@ struct tcp_streams { struct stbl st; struct tle_timer_wheel *tmr; /* timer wheel */ struct rte_ring *tsq; /* to-send streams queue */ + uint64_t da_ts; /* last time executing delay-ack */ + struct rte_ring *daq; /* delay-ack streams queue */ struct sdr dr; /* death row for zombie streams */ - struct tle_tcp_stream s[]; /* array of allocated streams. 
*/ }; #define CTX_TCP_STREAMS(ctx) ((struct tcp_streams *)(ctx)->streams.buf) #define CTX_TCP_STLB(ctx) (&CTX_TCP_STREAMS(ctx)->st) #define CTX_TCP_TMWHL(ctx) (CTX_TCP_STREAMS(ctx)->tmr) #define CTX_TCP_TSQ(ctx) (CTX_TCP_STREAMS(ctx)->tsq) +#define CTX_TCP_DAQ(ctx) (CTX_TCP_STREAMS(ctx)->daq) #define CTX_TCP_SDR(ctx) (&CTX_TCP_STREAMS(ctx)->dr) #ifdef __cplusplus diff --git a/lib/libtle_l4p/tcp_tx_seg.h b/lib/libtle_l4p/tcp_tx_seg.h index a8d2425..b64aa77 100644 --- a/lib/libtle_l4p/tcp_tx_seg.h +++ b/lib/libtle_l4p/tcp_tx_seg.h @@ -27,6 +27,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num, struct rte_mbuf *in_seg = NULL; uint32_t nbseg, in_seg_data_pos; uint32_t more_in_segs; + uint16_t out_bytes_remain; in_seg = mbin; in_seg_data_pos = 0; @@ -34,7 +35,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num, /* Check that pkts_out is big enough to hold all fragments */ if (mss * num < (uint16_t)mbin->pkt_len) - return -ENOSPC; + return -EAGAIN; more_in_segs = 1; while (more_in_segs) { @@ -48,6 +49,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num, return -ENOMEM; } + out_bytes_remain = mss; out_seg_prev = out_pkt; more_out_segs = 1; while (more_out_segs && more_in_segs) { @@ -66,7 +68,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num, /* Prepare indirect buffer */ rte_pktmbuf_attach(out_seg, in_seg); - len = mss; + len = out_bytes_remain; if (len > (in_seg->data_len - in_seg_data_pos)) len = in_seg->data_len - in_seg_data_pos; @@ -75,9 +77,10 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num, out_pkt->pkt_len = (uint16_t)(len + out_pkt->pkt_len); out_pkt->nb_segs += 1; in_seg_data_pos += len; + out_bytes_remain -= len; /* Current output packet (i.e. fragment) done ? */ - if (out_pkt->pkt_len >= mss) + if (out_bytes_remain == 0) more_out_segs = 0; /* Current input segment done ? 
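*/

The out_bytes_remain counter above fixes a segmentation overshoot: the
old loop stopped an output packet only once pkt_len reached the MSS, so
misaligned input segments could produce oversized frames. With the fix,
two 1000-byte input segments and a 1460-byte MSS split as 1460 + 540
instead of a single 2000-byte frame:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint16_t mss = 1460, in0 = 1000, in1 = 1000;
	uint16_t out0 = in0 + (mss - in0);  /* 1000 + 460 from segment 2 */
	uint16_t out1 = in0 + in1 - out0;   /* the 540-byte remainder */

	assert(out0 == mss && out1 == 540);
	return 0;
}

/*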
diff --git a/lib/libtle_l4p/tcp_tx_seg.h b/lib/libtle_l4p/tcp_tx_seg.h
index a8d2425..b64aa77 100644
--- a/lib/libtle_l4p/tcp_tx_seg.h
+++ b/lib/libtle_l4p/tcp_tx_seg.h
@@ -27,6 +27,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
 	struct rte_mbuf *in_seg = NULL;
 	uint32_t nbseg, in_seg_data_pos;
 	uint32_t more_in_segs;
+	uint16_t out_bytes_remain;
 
 	in_seg = mbin;
 	in_seg_data_pos = 0;
@@ -34,7 +35,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
 
 	/* Check that pkts_out is big enough to hold all fragments */
 	if (mss * num < (uint16_t)mbin->pkt_len)
-		return -ENOSPC;
+		return -EAGAIN;
 
 	more_in_segs = 1;
 	while (more_in_segs) {
@@ -48,6 +49,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
 			return -ENOMEM;
 		}
 
+		out_bytes_remain = mss;
 		out_seg_prev = out_pkt;
 		more_out_segs = 1;
 		while (more_out_segs && more_in_segs) {
@@ -66,7 +68,7 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
 
 			/* Prepare indirect buffer */
 			rte_pktmbuf_attach(out_seg, in_seg);
-			len = mss;
+			len = out_bytes_remain;
 			if (len > (in_seg->data_len - in_seg_data_pos))
 				len = in_seg->data_len - in_seg_data_pos;
@@ -75,9 +77,10 @@ tcp_segmentation(struct rte_mbuf *mbin, struct rte_mbuf *mbout[], uint16_t num,
 			out_pkt->pkt_len = (uint16_t)(len + out_pkt->pkt_len);
 			out_pkt->nb_segs += 1;
 			in_seg_data_pos += len;
+			out_bytes_remain -= len;
 
 			/* Current output packet (i.e. fragment) done ? */
-			if (out_pkt->pkt_len >= mss)
+			if (out_bytes_remain == 0)
 				more_out_segs = 0;
 
 			/* Current input segment done ? */
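The out_bytes_remain counter fixes an over-sized segment bug that shows up whenever an input-segment boundary falls inside an output packet. With mss = 1460 and 1000 bytes left in the current input segment, the old code attached those 1000 bytes, saw pkt_len (1000) still below mss, attached up to another full mss (1460) from the next input segment, and only then hit the pkt_len >= mss check, emitting a 2460-byte "segment". Counting down from mss caps the second attach at 460 bytes, so every output packet except possibly the last carries exactly mss payload bytes.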
diff --git a/lib/libtle_l4p/tcp_txq.h b/lib/libtle_l4p/tcp_txq.h
index 78f1d56..4588ffa 100644
--- a/lib/libtle_l4p/tcp_txq.h
+++ b/lib/libtle_l4p/tcp_txq.h
@@ -69,8 +69,36 @@ static inline void
 tcp_txq_rst_nxt_head(struct tle_tcp_stream *s)
 {
 	struct rte_ring *r;
+	struct rte_mbuf *m, *next;
+	uint32_t offset, data_len;
 
 	r = s->tx.q;
+
+	/* if cons.head is not the mbuf to send next, clean offset info */
+	if (r->cons.head >= r->cons.tail && r->cons.head < r->prod.tail) {
+		m = (struct rte_mbuf *)(_rte_ring_get_data(r)[r->cons.head & r->mask]);
+		if (m->next_pkt != NULL) {
+			m->next_pkt->next_offset = 0;
+			m->next_pkt = NULL;
+		}
+	}
+
+	/* set offset to unacknowledged position and send from it */
+	if (r->cons.tail < r->prod.tail) {
+		m = (struct rte_mbuf *)(_rte_ring_get_data(r)[r->cons.tail & r->mask]);
+		if (m->una_offset > 0) {
+			offset = m->una_offset;
+			next = m;
+			data_len = m->data_len - PKT_L234_HLEN(m);
+			while (offset > data_len) {
+				offset -= data_len;
+				next = next->next;
+				data_len = next->data_len;
+			}
+			m->next_pkt = next;
+			next->next_offset = offset;
+		}
+	}
+
 	r->cons.head = r->cons.tail;
 }
 
@@ -99,7 +127,8 @@ txs_enqueue(struct tle_ctx *ctx, struct tle_tcp_stream *s)
 	struct rte_ring *r;
 	uint32_t n;
 
-	if (rte_atomic32_add_return(&s->tx.arm, 1) == 1) {
+	rte_atomic32_inc(&s->tx.arm);
+	if (rte_atomic32_cmpset((volatile uint32_t *)&s->tx.in_tsq, 0, 1)) {
 		r = CTX_TCP_TSQ(ctx);
 		n = _rte_ring_enqueue_burst(r, (void * const *)&s, 1);
 		RTE_VERIFY(n == 1);
 	}
@@ -110,9 +139,14 @@ static inline uint32_t
 txs_dequeue_bulk(struct tle_ctx *ctx, struct tle_tcp_stream *s[], uint32_t num)
 {
 	struct rte_ring *r;
+	uint32_t n, i;
 
 	r = CTX_TCP_TSQ(ctx);
-	return _rte_ring_dequeue_burst(r, (void **)s, num);
+	n = _rte_ring_dequeue_burst(r, (void **)s, num);
+	for (i = 0; i < n; i++)
+		rte_atomic32_clear(&s[i]->tx.in_tsq);
+
+	return n;
 }
 
 #ifdef __cplusplus
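The reworked txs_enqueue() separates two roles the old arm counter mixed together: arm now counts outstanding send work, while in_tsq is a pure membership flag, so a stream can be armed many times yet sit in the to-send ring at most once; the flag is cleared only when txs_dequeue_bulk() hands the stream back to the consumer. The idiom in isolation (illustrative names, not TLDK API):

    /* producer: first cmpset 0 -> 1 wins, later arms skip the enqueue */
    if (rte_atomic32_cmpset(&obj->in_queue, 0, 1))
        ring_enqueue(q, obj);

    /* consumer: clearing at dequeue lets a concurrent producer
     * re-queue the object while it is still being processed */
    n = ring_dequeue_burst(q, (void **)objs, RTE_DIM(objs));
    for (i = 0; i != n; i++)
        rte_atomic32_clear(&objs[i]->in_queue);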
diff --git a/lib/libtle_l4p/tle_ctx.h b/lib/libtle_l4p/tle_ctx.h
index de78a6b..17edcad 100644
--- a/lib/libtle_l4p/tle_ctx.h
+++ b/lib/libtle_l4p/tle_ctx.h
@@ -112,6 +112,8 @@ struct tle_ctx_param {
 	int32_t socket_id;         /**< socket ID to allocate memory for. */
 	uint32_t proto;            /**< L4 proto to handle. */
 	uint32_t max_streams;      /**< max number of streams in context. */
+	uint32_t min_streams;      /**< min number of streams at init. */
+	uint32_t delta_streams;    /**< number of streams added per expansion. */
 	uint32_t max_stream_rbufs; /**< max recv mbufs per stream. */
 	uint32_t max_stream_sbufs; /**< max send mbufs per stream. */
 	uint32_t send_bulk_size;   /**< expected # of packets per send call. */
@@ -145,6 +147,8 @@ struct tle_ctx_param {
  */
 #define TLE_TCP_TIMEWAIT_DEFAULT	UINT32_MAX
 
+#define TLE_TCP_FINWAIT_TIMEOUT	60000
+
 /**
  * create L4 processing context.
  * @param ctx_prm
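A sketch of how a context could opt into the new lazy stream allocation (values illustrative; other mandatory fields elided):

    struct tle_ctx_param cprm = {
        .socket_id = SOCKET_ID_ANY,
        .proto = TLE_PROTO_TCP,
        .max_streams = 65536,    /* hard upper bound */
        .min_streams = 1024,     /* allocated up front at ctx creation */
        .delta_streams = 256,    /* growth step once the free list runs dry */
        /* ... max_stream_rbufs, max_stream_sbufs, lookup callbacks ... */
    };
    struct tle_ctx *ctx = tle_ctx_create(&cprm);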
diff --git a/lib/libtle_l4p/tle_event.h b/lib/libtle_l4p/tle_event.h
index d730345..495ccad 100644
--- a/lib/libtle_l4p/tle_event.h
+++ b/lib/libtle_l4p/tle_event.h
@@ -47,12 +47,8 @@ struct tle_event {
 struct tle_evq {
 	rte_spinlock_t lock;
-	uint32_t nb_events;
-	uint32_t nb_armed;
-	uint32_t nb_free;
 	TAILQ_HEAD(, tle_event) armed;
-	TAILQ_HEAD(, tle_event) free;
-	struct tle_event events[0];
+	uint32_t nb_armed;
 };
 
 /**
@@ -275,6 +271,29 @@ tle_evq_get(struct tle_evq *evq, const void *evd[], uint32_t num)
 	return n;
 }
 
+static inline int32_t
+tle_evq_get_and_idle(struct tle_evq *evq, const void *evd[], uint32_t num)
+{
+	uint32_t i, n;
+	struct tle_event *ev;
+
+	if (evq->nb_armed == 0)
+		return 0;
+
+	rte_compiler_barrier();
+
+	rte_spinlock_lock(&evq->lock);
+	n = RTE_MIN(num, evq->nb_armed);
+	for (i = 0; i != n; i++) {
+		ev = TAILQ_FIRST(&evq->armed);
+		ev->state = TLE_SEV_IDLE;
+		TAILQ_REMOVE(&evq->armed, ev, ql);
+		evd[i] = ev->data;
+	}
+	evq->nb_armed -= n;
+	rte_spinlock_unlock(&evq->lock);
+	return n;
+}
 
 #ifdef __cplusplus
 }
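tle_evq_get_and_idle() differs from tle_evq_get() in that every event it returns is parked in TLE_SEV_IDLE, so it cannot fire again until the caller explicitly re-arms it; this suits edge-triggered processing where the application fully drains a stream before re-arming. A hedged usage sketch:

    const void *evd[32];
    int32_t i, n;

    n = tle_evq_get_and_idle(rx_evq, evd, RTE_DIM(evd));
    for (i = 0; i != n; i++) {
        struct tle_stream *s = (struct tle_stream *)(uintptr_t)evd[i];
        drain_stream(s);   /* application code, illustrative */
        /* re-arm (e.g. with tle_event_active()) when interested again */
    }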
diff --git a/lib/libtle_l4p/tle_stats.h b/lib/libtle_l4p/tle_stats.h
new file mode 100644
index 0000000..717e601
--- /dev/null
+++ b/lib/libtle_l4p/tle_stats.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 Ant Financial Services Group.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TLE_STATS_H
+#define TLE_STATS_H
+
+#include <rte_per_lcore.h>
+#include <rte_atomic.h>
+
+/* tcp mib definitions */
+/*
+ * RFC 1213: MIB-II TCP group
+ * RFC 2012 (updates 1213): SNMPv2-MIB-TCP
+ */
+enum
+{
+	TCP_MIB_RTOALGORITHM,	/* RtoAlgorithm */
+	TCP_MIB_RTOMIN,		/* RtoMin */
+	TCP_MIB_RTOMAX,		/* RtoMax */
+	TCP_MIB_MAXCONN,	/* MaxConn */
+	TCP_MIB_ACTIVEOPENS,	/* ActiveOpens */
+	TCP_MIB_PASSIVEOPENS,	/* PassiveOpens */
+	TCP_MIB_ATTEMPTFAILS,	/* AttemptFails */
+	TCP_MIB_ESTABRESETS,	/* EstabResets */
+	TCP_MIB_CURRESTAB,	/* CurrEstab */
+	TCP_MIB_INSEGS,		/* InSegs */
+	TCP_MIB_OUTSEGS,	/* OutSegs */
+	TCP_MIB_RETRANSSEGS,	/* RetransSegs */
+	TCP_MIB_INERRS,		/* InErrs */
+	TCP_MIB_OUTRSTS,	/* OutRsts */
+	TCP_MIB_CSUMERRORS,	/* InCsumErrors */
+	TCP_MIB_MAX
+};
+
+/* udp mib definitions */
+/*
+ * RFC 1213: MIB-II UDP group
+ * RFC 2013 (updates 1213): SNMPv2-MIB-UDP
+ */
+enum
+{
+	UDP_MIB_INDATAGRAMS,	/* InDatagrams */
+	UDP_MIB_NOPORTS,	/* NoPorts */
+	UDP_MIB_INERRORS,	/* InErrors */
+	UDP_MIB_OUTDATAGRAMS,	/* OutDatagrams */
+	UDP_MIB_RCVBUFERRORS,	/* RcvbufErrors */
+	UDP_MIB_SNDBUFERRORS,	/* SndbufErrors */
+	UDP_MIB_CSUMERRORS,	/* InCsumErrors */
+	UDP_MIB_IGNOREDMULTI,	/* IgnoredMulti */
+	UDP_MIB_MAX
+};
+
+struct tcp_mib {
+	unsigned long mibs[TCP_MIB_MAX];
+};
+
+struct udp_mib {
+	unsigned long mibs[UDP_MIB_MAX];
+};
+
+struct tle_mib {
+	struct tcp_mib tcp __rte_cache_aligned;
+	struct udp_mib udp __rte_cache_aligned;
+} __rte_cache_aligned;
+
+extern struct tle_mib default_mib;
+
+RTE_DECLARE_PER_LCORE(struct tle_mib *, mib);
+
+#define PERCPU_MIB RTE_PER_LCORE(mib)
+
+#define SNMP_INC_STATS(mib, field) (mib).mibs[field]++
+#define SNMP_DEC_STATS(mib, field) (mib).mibs[field]--
+#define SNMP_ADD_STATS(mib, field, n) (mib).mibs[field] += n
+#define SNMP_ADD_STATS_ATOMIC(mib, field, n) \
+	rte_atomic64_add((rte_atomic64_t *)(&(mib).mibs[field]), n)
+
+#define TCP_INC_STATS(field) SNMP_INC_STATS(PERCPU_MIB->tcp, field)
+#define TCP_DEC_STATS(field) SNMP_DEC_STATS(PERCPU_MIB->tcp, field)
+#define TCP_ADD_STATS(field, n) SNMP_ADD_STATS(PERCPU_MIB->tcp, field, n)
+#define TCP_INC_STATS_ATOMIC(field) \
+	SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->tcp, field, 1)
+#define TCP_DEC_STATS_ATOMIC(field) \
+	SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->tcp, field, (-1))
+
+#define UDP_INC_STATS(field) SNMP_INC_STATS(PERCPU_MIB->udp, field)
+#define UDP_ADD_STATS(field, n) SNMP_ADD_STATS(PERCPU_MIB->udp, field, n)
+#define UDP_ADD_STATS_ATOMIC(field, n) \
+	SNMP_ADD_STATS_ATOMIC(PERCPU_MIB->udp, field, n)
+
+#endif /* TLE_STATS_H */
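The counters are plain per-lcore loads and stores behind the RTE_PER_LCORE pointer, with the _ATOMIC variants reserved for fields several threads may touch at once; default_mib is the shared fallback instance. Usage sketch (assuming each worker points its per-lcore pointer somewhere at startup):

    /* thread init: aim this lcore's counters at the shared instance */
    RTE_PER_LCORE(mib) = &default_mib;

    /* datapath */
    TCP_INC_STATS(TCP_MIB_OUTSEGS);
    UDP_ADD_STATS(UDP_MIB_INDATAGRAMS, n);

    /* reporting */
    printf("tcp retrans segs: %lu\n",
        default_mib.tcp.mibs[TCP_MIB_RETRANSSEGS]);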
num - r); + } + /* if RX queue was empty invoke user RX notification callback. */ if (s->rx.cb.func != NULL && r != 0 && rte_ring_count(s->rx.q) == r) s->rx.cb.func(s->rx.cb.data, &s->s); @@ -167,28 +191,64 @@ rx_stream4(struct tle_udp_stream *s, struct rte_mbuf *pkt[], return rx_stream(s, mb, rp + k, rc + k, n); } +/* + * Consider 2 UDP pkt_info *equal* if their: + * - types (IPv4/IPv6) + * - TCP src and dst ports + * - IP src and dst addresses + * are equal. + */ +static inline int +udp_pkt_info_bulk_eq(const union pkt_info pi[], uint32_t num) +{ + uint32_t i; + + i = 1; + + if (pi[0].tf.type == TLE_V4) { + while (i != num && pi[i].tf.type == TLE_V4 && + pi[0].port.raw == pi[i].port.raw && + pi[0].addr4.raw == pi[i].addr4.raw) + i++; + } else if (pi[0].tf.type == TLE_V6) { + while (i != num && pi[i].tf.type == TLE_V6 && + pi[0].port.raw == pi[i].port.raw && + ymm_cmp(&pi[0].addr6->raw, + &pi[i].addr6->raw) == 0) + i++; + } + + return i; +} + uint16_t tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], struct rte_mbuf *rp[], int32_t rc[], uint16_t num) { + struct stbl *st; struct tle_udp_stream *s; - uint32_t i, j, k, n, p, t; + uint32_t i, j, k, n, t; union l4_ports tp[num], port[num]; union ipv4_addrs a4[num]; union ipv6_addrs *pa6[num]; + union pkt_info pi[num]; + + st = CTX_UDP_STLB(dev->ctx); for (i = 0; i != num; i++) - tp[i] = pkt_info(dev, pkt[i], &port[i], &a4[i], &pa6[i]); + tp[i] = pkt_info_udp(dev, pkt[i], &port[i], &a4[i], + &pa6[i], &pi[i]); k = 0; for (i = 0; i != num; i = j) { - for (j = i + 1; j != num && tp[j].raw == tp[i].raw; j++) - ; + j = i + udp_pkt_info_bulk_eq(pi + i, num - i); t = tp[i].src; - p = tp[i].dst; - s = rx_stream_obtain(dev, t, p); + + s = rx_stream_obtain_by_tuples(st, &pi[i]); + if (s == NULL) + s = rx_stream_obtain(dev, t, &pi[i]); if (s != NULL) { if (t == TLE_V4) @@ -205,6 +265,7 @@ tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], rwl_release(&s->rx.use); } else { + UDP_ADD_STATS(UDP_MIB_NOPORTS, j - i); for (; i != j; i++) { rc[k] = ENOENT; rp[k] = pkt[i]; @@ -265,6 +326,8 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num) stream_drb_release(s, drb + i, j - i); } + UDP_ADD_STATS(UDP_MIB_OUTDATAGRAMS, n); + return n; } @@ -275,28 +338,17 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num) static inline uint32_t recv_pkt_process(struct rte_mbuf *m[], uint32_t num, uint32_t type) { - uint32_t i, k; - uint64_t f, flg[num], ofl[num]; + uint32_t i, k, offset; - for (i = 0; i != num; i++) { - flg[i] = m[i]->ol_flags; - ofl[i] = m[i]->tx_offload; - } - - k = 0; - for (i = 0; i != num; i++) { - - f = flg[i] & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD); - - /* drop packets with invalid cksum(s). */ - if (f != 0 && check_pkt_csum(m[i], m[i]->ol_flags, type, - IPPROTO_UDP) != 0) { + for (i = 0, k = 0; i != num; i++) { + if (check_pkt_csum(m[i], type, IPPROTO_UDP) != 0) { + UDP_INC_STATS(UDP_MIB_CSUMERRORS); rte_pktmbuf_free(m[i]); m[i] = NULL; k++; } else { - m[i]->ol_flags ^= f; - rte_pktmbuf_adj(m[i], _tx_offload_l4_offset(ofl[i])); + offset = _tx_offload_l4_offset(m[i]->tx_offload); + rte_pktmbuf_adj(m[i], offset); } } @@ -310,22 +362,34 @@ tle_udp_stream_recv(struct tle_stream *us, struct rte_mbuf *pkt[], uint16_t num) struct tle_udp_stream *s; s = UDP_STREAM(us); - n = _rte_ring_mc_dequeue_burst(s->rx.q, (void **)pkt, num); - if (n == 0) - return 0; + n = 0; - /* - * if we still have packets to read, - * then rearm stream RX event. 
diff --git a/lib/libtle_l4p/tle_udp.h b/lib/libtle_l4p/tle_udp.h
index d3a8fe9..2db8753 100644
--- a/lib/libtle_l4p/tle_udp.h
+++ b/lib/libtle_l4p/tle_udp.h
@@ -35,6 +35,7 @@ struct tle_udp_stream_param {
 
 	struct tle_event *send_ev;      /**< send event to use. */
 	struct tle_stream_cb send_cb;   /**< send callback to use. */
+	uint64_t option;
 };
 
 /**
@@ -54,6 +55,25 @@ struct tle_stream *
 tle_udp_stream_open(struct tle_ctx *ctx,
 	const struct tle_udp_stream_param *prm);
 
+/**
+ * set an existing stream within the given UDP context with new parameters.
+ * @param ts
+ *   stream to set with new parameters
+ * @param ctx
+ *   UDP context to set the stream within.
+ * @param prm
+ *   Parameters used to set the stream.
+ * @return
+ *   Pointer to UDP stream structure that can be used in future UDP API calls,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENFILE - max limit of open streams reached for that context
+ */
+struct tle_stream *
+tle_udp_stream_set(struct tle_stream *ts, struct tle_ctx *ctx,
+	const struct tle_udp_stream_param *prm);
+
 /**
  * close an open stream.
  * All packets still remaining in stream receive buffer will be freed.
@@ -180,6 +200,24 @@ uint16_t tle_udp_stream_recv(struct tle_stream *s, struct rte_mbuf *pkt[],
 uint16_t tle_udp_stream_send(struct tle_stream *s, struct rte_mbuf *pkt[],
 	uint16_t num, const struct sockaddr *dst_addr);
 
+/**
+ * updates configuration (associated events, callbacks, stream parameters)
+ * for the given streams.
+ * @param ts
+ *   An array of pointers to the streams to update.
+ * @param prm
+ *   An array of parameters to update for the given streams.
+ * @param num
+ *   Number of elements in the *ts* and *prm* arrays.
+ * @return
+ *   number of streams successfully updated.
+ *   In case of error, error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ */
+uint32_t tle_udp_stream_update_cfg(struct tle_stream *ts[],
+	struct tle_udp_stream_param prm[], uint32_t num);
+
 #ifdef __cplusplus
 }
 #endif
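tle_udp_stream_set() is essentially a re-open of a live stream, which is what a socket glue layer needs to implement connect()/rebind on UDP; note from the implementation further below that on failure it closes the stream before returning NULL. Sketch (peer address setup elided):

    struct tle_udp_stream_param p;

    tle_udp_stream_get_param(ts, &p);   /* start from current settings */
    p.remote_addr = *peer;              /* new destination, illustrative */

    ts = tle_udp_stream_set(ts, ctx, &p);
    if (ts == NULL)
        /* the stream is already closed at this point */
        return -rte_errno;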
diff --git a/lib/libtle_l4p/udp_rxtx.c b/lib/libtle_l4p/udp_rxtx.c
index ab746d1..4f57454 100644
--- a/lib/libtle_l4p/udp_rxtx.c
+++ b/lib/libtle_l4p/udp_rxtx.c
@@ -13,7 +13,6 @@
  * limitations under the License.
  */
 
-#include
 #include
 #include
 #include
@@ -24,14 +23,11 @@
 #include "misc.h"
 
 static inline struct tle_udp_stream *
-rx_stream_obtain(struct tle_dev *dev, uint32_t type, uint32_t port)
+rx_stream_obtain_by_tuples(struct stbl *st, const union pkt_info *pi)
 {
 	struct tle_udp_stream *s;
 
-	if (type >= TLE_VNUM || dev->dp[type] == NULL)
-		return NULL;
-
-	s = (struct tle_udp_stream *)dev->dp[type]->streams[port];
+	s = UDP_STREAM(stbl_find_stream(st, pi));
 	if (s == NULL)
 		return NULL;
 
@@ -41,6 +37,24 @@ rx_stream_obtain(struct tle_dev *dev, uint32_t type, uint32_t port)
 	return s;
 }
 
+static inline struct tle_udp_stream *
+rx_stream_obtain(struct tle_dev *dev, uint32_t type, const union pkt_info *pi)
+{
+	struct tle_udp_stream *s;
+
+	if (type == TLE_V4)
+		s = bhash_lookup4(dev->ctx->bhash[type],
+			pi->addr4.dst, pi->port.dst, 1);
+	else
+		s = bhash_lookup6(dev->ctx->bhash[type],
+			pi->addr6->dst, pi->port.dst, 1);
+
+	if (s == NULL || rwl_acquire(&s->rx.use) < 0)
+		return NULL;
+
+	return s;
+}
+
 static inline uint16_t
 get_pkt_type(const struct rte_mbuf *m)
 {
@@ -57,9 +71,9 @@ get_pkt_type(const struct rte_mbuf *m)
 }
 
 static inline union l4_ports
-pkt_info(const struct tle_dev *dev, struct rte_mbuf *m,
-	union l4_ports *ports, union ipv4_addrs *addr4,
-	union ipv6_addrs **addr6)
+pkt_info_udp(const struct tle_dev *dev, struct rte_mbuf *m,
+	union l4_ports *ports, union ipv4_addrs *addr4,
+	union ipv6_addrs **addr6, union pkt_info *pi)
 {
 	uint32_t len;
 	union l4_ports ret, *up;
@@ -73,16 +87,21 @@ pkt_info(const struct tle_dev *dev, struct rte_mbuf *m,
 			len + offsetof(struct ipv4_hdr, src_addr));
 		addr4->raw = pa4->raw;
 		m->ol_flags |= dev->rx.ol_flags[TLE_V4];
+		pi->addr4.raw = pa4->raw;
+		pi->tf.type = TLE_V4;
 	} else if (ret.src == TLE_V6) {
 		*addr6 = rte_pktmbuf_mtod_offset(m, union ipv6_addrs *,
 			len + offsetof(struct ipv6_hdr, src_addr));
 		m->ol_flags |= dev->rx.ol_flags[TLE_V6];
+		pi->addr6 = *addr6;
+		pi->tf.type = TLE_V6;
 	}
 
 	len += m->l3_len;
 	up = rte_pktmbuf_mtod_offset(m, union l4_ports *,
 		len + offsetof(struct udp_hdr, src_port));
 	ports->raw = up->raw;
+	pi->port.raw = up->raw;
 	ret.dst = ports->dst;
 	return ret;
 }
@@ -99,6 +118,11 @@ rx_stream(struct tle_udp_stream *s, void *mb[], struct rte_mbuf *rp[],
 
 	r = _rte_ring_enqueue_burst(s->rx.q, mb, num);
 
+	if (unlikely(r != num)) {
+		UDP_ADD_STATS(UDP_MIB_RCVBUFERRORS, num - r);
+		UDP_ADD_STATS(UDP_MIB_INERRORS, num - r);
+	}
+
 	/* if RX queue was empty invoke user RX notification callback. */
 	if (s->rx.cb.func != NULL && r != 0 && rte_ring_count(s->rx.q) == r)
 		s->rx.cb.func(s->rx.cb.data, &s->s);
@@ -167,28 +191,64 @@ rx_stream4(struct tle_udp_stream *s, struct rte_mbuf *pkt[],
 	return rx_stream(s, mb, rp + k, rc + k, n);
 }
+/*
+ * Consider 2 UDP pkt_info *equal* if their:
+ * - types (IPv4/IPv6)
+ * - UDP src and dst ports
+ * - IP src and dst addresses
+ * are equal.
+ */
+static inline int
+udp_pkt_info_bulk_eq(const union pkt_info pi[], uint32_t num)
+{
+	uint32_t i;
+
+	i = 1;
+
+	if (pi[0].tf.type == TLE_V4) {
+		while (i != num && pi[i].tf.type == TLE_V4 &&
+				pi[0].port.raw == pi[i].port.raw &&
+				pi[0].addr4.raw == pi[i].addr4.raw)
+			i++;
+	} else if (pi[0].tf.type == TLE_V6) {
+		while (i != num && pi[i].tf.type == TLE_V6 &&
+				pi[0].port.raw == pi[i].port.raw &&
+				ymm_cmp(&pi[0].addr6->raw,
+					&pi[i].addr6->raw) == 0)
+			i++;
+	}
+
+	return i;
+}
+
 uint16_t
 tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
 	struct rte_mbuf *rp[], int32_t rc[], uint16_t num)
 {
+	struct stbl *st;
 	struct tle_udp_stream *s;
-	uint32_t i, j, k, n, p, t;
+	uint32_t i, j, k, n, t;
 	union l4_ports tp[num], port[num];
 	union ipv4_addrs a4[num];
 	union ipv6_addrs *pa6[num];
+	union pkt_info pi[num];
+
+	st = CTX_UDP_STLB(dev->ctx);
 
 	for (i = 0; i != num; i++)
-		tp[i] = pkt_info(dev, pkt[i], &port[i], &a4[i], &pa6[i]);
+		tp[i] = pkt_info_udp(dev, pkt[i], &port[i], &a4[i],
+			&pa6[i], &pi[i]);
 
 	k = 0;
 	for (i = 0; i != num; i = j) {
 
-		for (j = i + 1; j != num && tp[j].raw == tp[i].raw; j++)
-			;
+		j = i + udp_pkt_info_bulk_eq(pi + i, num - i);
 
 		t = tp[i].src;
-		p = tp[i].dst;
-		s = rx_stream_obtain(dev, t, p);
+
+		s = rx_stream_obtain_by_tuples(st, &pi[i]);
+		if (s == NULL)
+			s = rx_stream_obtain(dev, t, &pi[i]);
+
 		if (s != NULL) {
 
 			if (t == TLE_V4)
@@ -205,6 +265,7 @@ tle_udp_rx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[],
 			rwl_release(&s->rx.use);
 
 		} else {
+			UDP_ADD_STATS(UDP_MIB_NOPORTS, j - i);
 			for (; i != j; i++) {
 				rc[k] = ENOENT;
 				rp[k] = pkt[i];
@@ -265,6 +326,8 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num)
 		stream_drb_release(s, drb + i, j - i);
 	}
 
+	UDP_ADD_STATS(UDP_MIB_OUTDATAGRAMS, n);
+
 	return n;
 }
 
@@ -275,28 +338,17 @@ tle_udp_tx_bulk(struct tle_dev *dev, struct rte_mbuf *pkt[], uint16_t num)
 static inline uint32_t
 recv_pkt_process(struct rte_mbuf *m[], uint32_t num, uint32_t type)
 {
-	uint32_t i, k;
-	uint64_t f, flg[num], ofl[num];
+	uint32_t i, k, offset;
 
-	for (i = 0; i != num; i++) {
-		flg[i] = m[i]->ol_flags;
-		ofl[i] = m[i]->tx_offload;
-	}
-
-	k = 0;
-	for (i = 0; i != num; i++) {
-
-		f = flg[i] & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD);
-
-		/* drop packets with invalid cksum(s). */
-		if (f != 0 && check_pkt_csum(m[i], m[i]->ol_flags, type,
-				IPPROTO_UDP) != 0) {
+	for (i = 0, k = 0; i != num; i++) {
+		if (check_pkt_csum(m[i], type, IPPROTO_UDP) != 0) {
+			UDP_INC_STATS(UDP_MIB_CSUMERRORS);
 			rte_pktmbuf_free(m[i]);
 			m[i] = NULL;
 			k++;
 		} else {
-			m[i]->ol_flags ^= f;
-			rte_pktmbuf_adj(m[i], _tx_offload_l4_offset(ofl[i]));
+			offset = _tx_offload_l4_offset(m[i]->tx_offload);
+			rte_pktmbuf_adj(m[i], offset);
 		}
 	}
 
@@ -310,22 +362,34 @@ tle_udp_stream_recv(struct tle_stream *us, struct rte_mbuf *pkt[], uint16_t num)
 	struct tle_udp_stream *s;
 
 	s = UDP_STREAM(us);
-	n = _rte_ring_mc_dequeue_burst(s->rx.q, (void **)pkt, num);
-	if (n == 0)
-		return 0;
+	n = 0;
 
-	/*
-	 * if we still have packets to read,
-	 * then rearm stream RX event.
-	 */
-	if (n == num && rte_ring_count(s->rx.q) != 0) {
-		if (rwl_try_acquire(&s->rx.use) > 0 && s->rx.ev != NULL)
-			tle_event_raise(s->rx.ev);
-		rwl_release(&s->rx.use);
+	while (n == 0) {
+		n = _rte_ring_mc_dequeue_burst(s->rx.q, (void **)pkt, num);
+		if (n == 0) {
+			rte_errno = EAGAIN;
+			return 0;
+		}
+
+		/*
+		 * if we still have packets to read,
+		 * then rearm stream RX event.
+		 */
+		if (n == num && rte_ring_count(s->rx.q) != 0) {
+			if (rwl_try_acquire(&s->rx.use) > 0 && s->rx.ev != NULL)
+				tle_event_raise(s->rx.ev);
+			rwl_release(&s->rx.use);
+		}
+
+		k = recv_pkt_process(pkt, n, s->s.type);
+		if (unlikely(k))
+			UDP_ADD_STATS_ATOMIC(UDP_MIB_CSUMERRORS, k);
+		n = compress_pkt_list(pkt, n, k);
 	}
 
-	k = recv_pkt_process(pkt, n, s->s.type);
-	return compress_pkt_list(pkt, n, k);
+	if (likely(n))
+		UDP_ADD_STATS_ATOMIC(UDP_MIB_INDATAGRAMS, n);
+
 	return n;
 }
 
 static inline int
@@ -421,7 +485,7 @@ fragment(struct rte_mbuf *pkt, struct rte_mbuf *frag[], uint32_t num,
 
 	/* Remove the Ethernet header from the input packet */
 	rte_pktmbuf_adj(pkt, dst->l2_len);
-	mtu = dst->mtu - dst->l2_len;
+	mtu = dst->mtu;
 
 	/* fragment packet */
 	if (type == TLE_V4)
@@ -465,7 +529,7 @@ stream_drb_alloc(struct tle_udp_stream *s, struct tle_drb *drbs[],
 static inline uint16_t
 queue_pkt_out(struct tle_udp_stream *s, struct tle_dev *dev,
 	const void *pkt[], uint16_t nb_pkt,
-	struct tle_drb *drbs[], uint32_t *nb_drb)
+	struct tle_drb *drbs[], uint32_t *nb_drb, uint8_t all_or_nothing)
 {
 	uint32_t bsz, i, n, nb, nbc, nbm;
 
@@ -483,12 +547,24 @@ queue_pkt_out(struct tle_udp_stream *s, struct tle_dev *dev,
 	nb += nbc;
 
 	/* no free drbs, can't send anything */
-	if (nb == 0)
+	if (unlikely(nb == 0)) {
+		if (all_or_nothing)
+			UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, 1);
+		else
+			UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, nb_pkt);
 		return 0;
+	}
 
 	/* not enough free drbs, reduce number of packets to send. */
-	else if (nb != nbm)
+	else if (nb != nbm) {
+		if (all_or_nothing) {
+			UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, 1);
+			return 0;
+		}
+
+		UDP_ADD_STATS_ATOMIC(UDP_MIB_SNDBUFERRORS, nb_pkt - nb * bsz);
 		nb_pkt = nb * bsz;
+	}
 
 	/* enqueue packets to the destination device. */
 	nbc = nb;
@@ -514,12 +590,19 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
 	const struct sockaddr_in *d4;
 	const struct sockaddr_in6 *d6;
 	struct tle_udp_stream *s;
-	const void *da;
+	const void *sa, *da;
 	union udph udph;
 	struct tle_dest dst;
 	struct tle_drb *drb[num];
+	uint8_t ufo;
 
 	s = UDP_STREAM(us);
+	if (!rwl_is_up(&(s->tx.use))) {
+		/* tx is shutdown */
+		rte_errno = EPIPE;
+		return 0;
+	}
+
 	type = s->s.type;
 
 	/* start filling UDP header. */
@@ -528,7 +611,9 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
 
 	/* figure out what destination addr/port to use. */
 	if (dst_addr != NULL) {
-		if (dst_addr->sa_family != s->prm.remote_addr.ss_family) {
+		if (dst_addr->sa_family != s->prm.remote_addr.ss_family &&
+				(s->prm.remote_addr.ss_family == AF_INET ||
+				!IN6_IS_ADDR_UNSPECIFIED(&s->s.ipv6.addr.dst))) {
 			rte_errno = EINVAL;
 			return 0;
 		}
@@ -536,20 +621,26 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
 			d4 = (const struct sockaddr_in *)dst_addr;
 			da = &d4->sin_addr;
 			udph.ports.dst = d4->sin_port;
+			sa = &s->s.ipv4.addr.dst;
 		} else {
 			d6 = (const struct sockaddr_in6 *)dst_addr;
 			da = &d6->sin6_addr;
 			udph.ports.dst = d6->sin6_port;
+			sa = &s->s.ipv6.addr.dst;
 		}
 	} else {
 		udph.ports.dst = s->s.port.src;
-		if (type == TLE_V4)
+		if (type == TLE_V4) {
 			da = &s->s.ipv4.addr.src;
-		else
+			sa = &s->s.ipv4.addr.dst;
+		}
+		else {
 			da = &s->s.ipv6.addr.src;
+			sa = &s->s.ipv6.addr.dst;
+		}
 	}
 
-	di = stream_get_dest(&s->s, da, &dst);
+	di = stream_get_dest(type, &s->s, sa, da, &dst);
 	if (di < 0) {
 		rte_errno = -di;
 		return 0;
@@ -564,6 +655,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
 		return 0;
 	}
 
+	ufo = dst.dev->prm.tx_offload & DEV_TX_OFFLOAD_UDP_TSO;
 	nb = 0;
 	for (i = 0, k = 0; k != num; k = i) {
 
@@ -573,7 +665,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
 		ol_flags = dst.dev->tx.ol_flags[type];
 
 		while (i != num && frg == 0) {
-			frg = pkt[i]->pkt_len > mtu;
+			frg = (!ufo) && pkt[i]->pkt_len > mtu;
 			if (frg != 0)
 				ol_flags &= ~PKT_TX_UDP_CKSUM;
 			rc = udp_fill_mbuf(pkt[i], type, ol_flags, pid + i,
@@ -589,7 +681,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
 		if (k != i) {
 			k += queue_pkt_out(s, dst.dev,
 				(const void **)(uintptr_t)&pkt[k], i - k,
-				drb, &nb);
+				drb, &nb, 0);
 
 			/* stream TX queue is full. */
 			if (k != i) {
@@ -611,7 +703,7 @@ tle_udp_stream_send(struct tle_stream *us, struct rte_mbuf *pkt[],
 			}
 
 			n = queue_pkt_out(s, dst.dev,
-				(const void **)(uintptr_t)frag, rc, drb, &nb);
+				(const void **)(uintptr_t)frag, rc, drb, &nb, 1);
 			if (n == 0) {
 				while (rc-- != 0)
 					rte_pktmbuf_free(frag[rc]);
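With the early tx.use check, tle_udp_stream_send() now fails fast on a shut-down stream instead of queueing into it, and callers can tell that apart from transient back-pressure (sketch):

    n = tle_udp_stream_send(us, pkt, num, NULL);
    if (n == 0) {
        if (rte_errno == EPIPE) {
            /* send side shut down: free the pkts, report to the app */
        } else {
            /* out of drbs / TX queue full: back off and retry later */
        }
    }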
diff --git a/lib/libtle_l4p/udp_stream.c b/lib/libtle_l4p/udp_stream.c
index 29f5a40..f8fd855 100644
--- a/lib/libtle_l4p/udp_stream.c
+++ b/lib/libtle_l4p/udp_stream.c
@@ -43,74 +43,87 @@ fini_stream(struct tle_udp_stream *s)
 static void
 udp_fini_streams(struct tle_ctx *ctx)
 {
-	uint32_t i;
-	struct tle_udp_stream *s;
+	struct udp_streams *us;
+	struct tle_stream *s;
+
+	us = CTX_UDP_STREAMS(ctx);
+	if (us != NULL) {
+		stbl_fini(&us->st);
+
+		while (ctx->streams.nb_free--) {
+			s = STAILQ_FIRST(&ctx->streams.free);
+			STAILQ_FIRST(&ctx->streams.free) = STAILQ_NEXT(s, link);
+			fini_stream(UDP_STREAM(s));
+		}
 
-	s = ctx->streams.buf;
-	if (s != NULL) {
-		for (i = 0; i != ctx->prm.max_streams; i++)
-			fini_stream(s + i);
 	}
 
-	rte_free(s);
+	rte_free(us);
 	ctx->streams.buf = NULL;
 	STAILQ_INIT(&ctx->streams.free);
 }
 
+/* stream memory layout:
+ * [tle_udp_stream] [rx.q] [tx.drb.r]
+ */
 static int
-init_stream(struct tle_ctx *ctx, struct tle_udp_stream *s)
+add_stream(struct tle_ctx *ctx)
 {
-	size_t bsz, rsz, sz;
-	uint32_t i, k, n, nb;
+	size_t sz_s, sz_rxq, sz_drb_r, sz;
+	/* for rx.q */
+	uint32_t n_rxq;
+	/* for tx.drb.r */
+	size_t bsz, rsz;
 	struct tle_drb *drb;
-	char name[RTE_RING_NAMESIZE];
-
-	/* init RX part. */
-
-	n = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
-	n = rte_align32pow2(n);
-	sz = rte_ring_get_memsize(n);
+	uint32_t k, nb, n_drb;
 
-	s->rx.q = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
-		ctx->prm.socket_id);
-	if (s->rx.q == NULL) {
-		UDP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d "
-			"failed with error code: %d\n",
-			__func__, s, sz, ctx->prm.socket_id, rte_errno);
-		return -ENOMEM;
-	}
+	uint32_t i, f;
+	char name[RTE_RING_NAMESIZE];
+	struct tle_udp_stream *s;
 
-	snprintf(name, sizeof(name), "%p@%zu", s, sz);
-	rte_ring_init(s->rx.q, name, n, RING_F_SP_ENQ);
+	/* stream */
+	sz_s = RTE_ALIGN_CEIL(sizeof(*s), RTE_CACHE_LINE_SIZE);
 
-	/* init TX part. */
+	/* rx.q */
+	n_rxq = RTE_MAX(ctx->prm.max_stream_rbufs, 1U);
+	n_rxq = rte_align32pow2(n_rxq);
+	sz_rxq = rte_ring_get_memsize(n_rxq);
+	sz_rxq = RTE_ALIGN_CEIL(sz_rxq, RTE_CACHE_LINE_SIZE);
 
+	/* tx.drb.r */
 	nb = drb_nb_elem(ctx);
 	k = calc_stream_drb_num(ctx, nb);
-	n = rte_align32pow2(k);
-
-	/* size of the drbs ring */
-	rsz = rte_ring_get_memsize(n);
+	n_drb = rte_align32pow2(k);
+	rsz = rte_ring_get_memsize(n_drb); /* size of the drbs ring */
 	rsz = RTE_ALIGN_CEIL(rsz, RTE_CACHE_LINE_SIZE);
+	bsz = tle_drb_calc_size(nb); /* size of the drb. */
+	sz_drb_r = rsz + bsz * k; /* total stream drbs size. */
+	sz_drb_r = RTE_ALIGN_CEIL(sz_drb_r, RTE_CACHE_LINE_SIZE);
 
-	/* size of the drb. */
-	bsz = tle_drb_calc_size(nb);
-
-	/* total stream drbs size. */
-	sz = rsz + bsz * k;
-
-	s->tx.drb.r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
-		ctx->prm.socket_id);
-	if (s->tx.drb.r == NULL) {
-		UDP_LOG(ERR, "%s(%p): allocation of %zu bytes on socket %d "
+	sz = sz_s + sz_rxq + sz_drb_r;
+	s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
+		ctx->prm.socket_id);
+	if (s == NULL) {
+		UDP_LOG(ERR, "%s: allocation of %zu bytes on socket %d "
 			"failed with error code: %d\n",
-			__func__, s, sz, ctx->prm.socket_id, rte_errno);
+			__func__, sz, ctx->prm.socket_id, rte_errno);
 		return -ENOMEM;
 	}
 
-	snprintf(name, sizeof(name), "%p@%zu", s, sz);
-	rte_ring_init(s->tx.drb.r, name, n, 0);
+	s->rx.q = (struct rte_ring *)((uintptr_t)s + sz_s);
+	s->tx.drb.r = (struct rte_ring *)((uintptr_t)s->rx.q + sz_rxq);
+
+	/* ring flags */
+	f = ((ctx->prm.flags & TLE_CTX_FLAG_ST) == 0) ? 0 :
+		(RING_F_SP_ENQ | RING_F_SC_DEQ);
+
+	/* init RX part. */
+	snprintf(name, sizeof(name), "%p@%zu", s->rx.q, sz_rxq);
+	rte_ring_init(s->rx.q, name, n_rxq, f);
+
+	/* init TX part. */
+	snprintf(name, sizeof(name), "%p@%zu", s->tx.drb.r, sz_drb_r);
+	rte_ring_init(s->tx.drb.r, name, n_drb, f);
 
 	for (i = 0; i != k; i++) {
 		drb = (struct tle_drb *)((uintptr_t)s->tx.drb.r +
 			rsz + bsz * i);
@@ -146,38 +159,59 @@ udp_init_streams(struct tle_ctx *ctx)
 	size_t sz;
 	uint32_t i;
 	int32_t rc;
-	struct tle_udp_stream *s;
+	struct udp_streams *us;
 
-	sz = sizeof(*s) * ctx->prm.max_streams;
-	s = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
+	sz = sizeof(*us);
+	us = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
 		ctx->prm.socket_id);
-	if (s == NULL) {
+	if (us == NULL) {
 		UDP_LOG(ERR, "allocation of %zu bytes on socket %d "
 			"for %u udp_streams failed\n",
 			sz, ctx->prm.socket_id, ctx->prm.max_streams);
 		return -ENOMEM;
 	}
 
-	ctx->streams.buf = s;
+	ctx->streams.buf = us;
 	STAILQ_INIT(&ctx->streams.free);
 
-	for (i = 0; i != ctx->prm.max_streams; i++) {
-		rc = init_stream(ctx, s + i);
-		if (rc != 0) {
-			UDP_LOG(ERR, "initalisation of %u-th stream failed", i);
-			udp_fini_streams(ctx);
-			return rc;
-		}
+	rc = stbl_init(&us->st, (ctx->prm.flags & TLE_CTX_FLAG_ST) == 0);
+	if (rc < 0) {
+		UDP_LOG(ERR, "failed to init UDP stbl: rc = %d\n", rc);
+		return rc;
 	}
 
-	return 0;
+	for (i = 0; rc == 0 && i != ctx->prm.min_streams; i++)
+		rc = add_stream(ctx);
+
+	if (rc != 0) {
+		UDP_LOG(ERR, "initialisation of %u-th stream failed", i);
+		udp_fini_streams(ctx);
+	}
+
+	return rc;
+}
+
+static uint32_t
+udp_more_streams(struct tle_ctx *ctx)
+{
+	uint32_t i, nb;
+	uint32_t nb_max = ctx->prm.max_streams;
+	uint32_t nb_cur = ctx->streams.nb_cur;
+
+	nb = RTE_MIN(ctx->prm.delta_streams, nb_max - nb_cur);
+	for (i = 0; i < nb; i++)
+		if (add_stream(ctx) != 0)
+			break;
+
+	return i;
 }
 
-static void __attribute__((constructor))
+static void __attribute__((constructor(101)))
 udp_stream_setup(void)
 {
 	static const struct stream_ops udp_ops = {
 		.init_streams = udp_init_streams,
+		.more_streams = udp_more_streams,
 		.fini_streams = udp_fini_streams,
 		.free_drbs = udp_free_drbs,
 	};
@@ -223,6 +257,59 @@ check_stream_prm(const struct tle_ctx *ctx,
 	return 0;
 }
 
+struct tle_stream *
+tle_udp_stream_set(struct tle_stream *ts, struct tle_ctx *ctx,
+	const struct tle_udp_stream_param *prm)
+{
+	struct tle_udp_stream *s;
+	int32_t rc;
+
+	if (ctx == NULL || prm == NULL || check_stream_prm(ctx, prm) != 0) {
+		tle_udp_stream_close(ts);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	s = UDP_STREAM(ts);
+
+	/* free stream's destination port */
+	rc = stream_clear_ctx(ctx, &s->s);
+
+	if (s->ste) {
+		stbl_del_stream(CTX_UDP_STLB(ctx), s->ste, ts);
+		s->ste = NULL;
+	}
+
+	/* copy input parameters. */
+	s->prm = *prm;
+	s->s.option.raw = prm->option;
+
+	/* setup L4 ports and L3 addresses fields. */
+	rc = stream_fill_ctx(ctx, &s->s,
+		(const struct sockaddr *)&prm->local_addr,
+		(const struct sockaddr *)&prm->remote_addr);
+
+	if (rc != 0)
+		goto error;
+
+	/* add stream to the table for non-listen type stream */
+	if (!is_empty_addr((const struct sockaddr *)&prm->remote_addr)) {
+		s->ste = stbl_add_stream(CTX_UDP_STLB(ctx), &s->s);
+		if (s->ste == NULL) {
+			rc = EEXIST;
+			goto error;
+		}
+	}
+
+	return &s->s;
+
+error:
+	tle_udp_stream_close(ts);
+	rte_errno = rc;
+	return NULL;
+}
+
 struct tle_stream *
 tle_udp_stream_open(struct tle_ctx *ctx,
 	const struct tle_udp_stream_param *prm)
@@ -237,42 +324,48 @@ tle_udp_stream_open(struct tle_ctx *ctx,
 
 	s = (struct tle_udp_stream *)get_stream(ctx);
 	if (s == NULL) {
-		rte_errno = ENFILE;
-		return NULL;
-
-	/* some TX still pending for that stream. */
-	} else if (UDP_STREAM_TX_PENDING(s)) {
-		put_stream(ctx, &s->s, 0);
 		rte_errno = EAGAIN;
 		return NULL;
 	}
 
 	/* copy input parameters. */
 	s->prm = *prm;
+	s->s.option.raw = prm->option;
 
 	/* setup L4 ports and L3 addresses fields. */
 	rc = stream_fill_ctx(ctx, &s->s,
 		(const struct sockaddr *)&prm->local_addr,
 		(const struct sockaddr *)&prm->remote_addr);
 
-	if (rc != 0) {
-		put_stream(ctx, &s->s, 1);
-		s = NULL;
-		rte_errno = rc;
-	} else {
-		/* setup stream notification menchanism */
-		s->rx.ev = prm->recv_ev;
-		s->rx.cb = prm->recv_cb;
-		s->tx.ev = prm->send_ev;
-		s->tx.cb = prm->send_cb;
-
-		/* mark stream as avaialbe for RX/TX */
-		if (s->tx.ev != NULL)
-			tle_event_raise(s->tx.ev);
-		stream_up(s);
+	if (rc != 0)
+		goto error;
+
+	/* add stream to the table for non-listen type stream */
+	if (!is_empty_addr((const struct sockaddr *)&prm->remote_addr)) {
+		s->ste = stbl_add_stream(CTX_UDP_STLB(ctx), &s->s);
+		if (s->ste == NULL) {
+			rc = EEXIST;
+			goto error;
+		}
 	}
 
+	/* setup stream notification mechanism */
+	s->rx.ev = prm->recv_ev;
+	s->rx.cb = prm->recv_cb;
+	s->tx.ev = prm->send_ev;
+	s->tx.cb = prm->send_cb;
+
+	/* mark stream as available for RX/TX */
+	if (s->tx.ev != NULL)
+		tle_event_raise(s->tx.ev);
+	stream_up(s);
+
 	return &s->s;
+
+error:
+	put_stream(ctx, &s->s, 1);
+	rte_errno = rc;
+	return NULL;
 }
 
 int
@@ -312,6 +405,11 @@ tle_udp_stream_close(struct tle_stream *us)
 	/* empty stream's RX queue */
 	empty_mbuf_ring(s->rx.q);
 
+	if (s->ste) {
+		stbl_del_stream(CTX_UDP_STLB(ctx), s->ste, us);
+		s->ste = NULL;
+	}
+
 	/*
 	 * mark the stream as free again.
 	 * if there still are pkts queued for TX,
@@ -344,3 +442,56 @@ tle_udp_stream_get_param(const struct tle_stream *us,
 
 	return 0;
 }
+
+/*
+ * helper function, updates stream config
+ */
+static inline int
+stream_update_cfg(struct tle_stream *us, struct tle_udp_stream_param *prm)
+{
+	struct tle_udp_stream *s;
+
+	s = UDP_STREAM(us);
+
+	/* setup stream notification mechanism */
+	s->rx.ev = prm->recv_ev;
+	s->rx.cb = prm->recv_cb;
+	s->tx.ev = prm->send_ev;
+	s->tx.cb = prm->send_cb;
+
+	rte_smp_wmb();
+
+	/* invoke async notifications, if any */
+	if (rte_ring_count(s->rx.q) != 0) {
+		if (s->rx.ev != NULL)
+			tle_event_raise(s->rx.ev);
+		else if (s->rx.cb.func != NULL)
+			s->rx.cb.func(s->rx.cb.data, &s->s);
+	}
+
+	/* always ok to write */
+	if (s->tx.ev != NULL)
+		tle_event_raise(s->tx.ev);
+	else if (s->tx.cb.func != NULL)
+		s->tx.cb.func(s->tx.cb.data, &s->s);
+
+	return 0;
+}
+
+uint32_t
+tle_udp_stream_update_cfg(struct tle_stream *us[],
+	struct tle_udp_stream_param prm[], uint32_t num)
+{
+	int32_t rc;
+	uint32_t i;
+
+	for (i = 0; i != num; i++) {
+		rc = stream_update_cfg(us[i], &prm[i]);
+		if (rc != 0) {
+			rte_errno = -rc;
+			break;
+		}
+	}
+
+	return i;
+}
diff --git a/lib/libtle_l4p/udp_stream.h b/lib/libtle_l4p/udp_stream.h
index a950e56..55a66f8 100644
--- a/lib/libtle_l4p/udp_stream.h
+++ b/lib/libtle_l4p/udp_stream.h
@@ -24,6 +24,7 @@
 #include "osdep.h"
 #include "ctx.h"
 #include "stream.h"
+#include "stream_table.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -41,6 +42,7 @@ union udph {
 struct tle_udp_stream {
 
 	struct tle_stream s;
+	struct stbl_entry *ste; /* entry in streams table. */
 
 	struct {
 		struct rte_ring *q;
@@ -63,6 +65,13 @@ struct tle_udp_stream {
 	struct tle_udp_stream_param prm;
 } __rte_cache_aligned;
 
+struct udp_streams {
+	struct stbl st;
+};
+
+#define CTX_UDP_STREAMS(ctx)	((struct udp_streams *)(ctx)->streams.buf)
+#define CTX_UDP_STLB(ctx)	(&CTX_UDP_STREAMS(ctx)->st)
+
 #define UDP_STREAM(p) \
 	((struct tle_udp_stream *)((uintptr_t)(p) - offsetof(struct tle_udp_stream, s)))
diff --git a/lib/libtle_timer/timer.c b/lib/libtle_timer/timer.c
index 8b89fd6..0eb7a5c 100644
--- a/lib/libtle_timer/timer.c
+++ b/lib/libtle_timer/timer.c
@@ -29,6 +29,7 @@
 #include
 #include
+#include <rte_spinlock.h>
 #include
 #include
 #include
@@ -59,6 +60,7 @@ struct tle_timer_elmt {
 struct tle_timer_list {
 	uint32_t num;
+	rte_spinlock_t lock;
 	LIST_HEAD(, tle_timer_elmt) head;
 };
 
@@ -134,6 +136,32 @@ put_timer(struct tle_timer_list *list, struct tle_timer_elmt *e)
 	list->num++;
 }
 
+static inline struct tle_timer_elmt *
+get_free_timer(struct tle_timer_wheel *tw)
+{
+	unsigned i, n;
+	struct tle_timer_elmt *e;
+
+	rte_spinlock_lock(&tw->free.lock);
+	e = LIST_FIRST(&tw->free.head);
+	if (e == NULL) {
+		n = 128;
+		n = RTE_MIN(n, tw->prm.max_timer - tw->free.num);
+		for (i = 0; i < n; i++) {
+			e = rte_zmalloc_socket(NULL, sizeof(*e),
+				sizeof(e), tw->prm.socket_id);
+			if (e != NULL)
+				put_timer(&tw->free, e);
+			else
+				rte_panic("Failed to allocate timer");
+		}
+	}
+
+	e = get_timer(&tw->free);
+	rte_spinlock_unlock(&tw->free.lock);
+	return e;
+}
+
 static inline void
 rem_timer(struct tle_timer_list *list, struct tle_timer_elmt *e)
 {
@@ -149,8 +177,6 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now)
 	uint32_t i, j;
 	size_t sz;
 	struct tle_timer_wheel *tw;
-	struct tle_timer_elmt *e;
-	struct tle_timer_elmt *timers;
 
 	if (prm == NULL) {
 		rte_errno = -EINVAL;
@@ -169,7 +195,7 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now)
 		return NULL;
 	}
 
-	sz = sizeof(*tw) + prm->max_timer * sizeof(struct tle_timer_elmt);
+	sz = sizeof(*tw);
 
 	/* allocate memory */
 	tw = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE,
@@ -182,17 +208,11 @@ tle_timer_create(struct tle_timer_wheel_args *prm, uint64_t now)
 
 	tw->last_run_time = now;
 	tw->prm = *prm;
-	timers = (struct tle_timer_elmt *)(tw + 1);
 
 	/* initialize the lists */
 	LIST_INIT(&tw->free.head);
 	LIST_INIT(&tw->expired.head);
 
-	for (i = 0; i < prm->max_timer; i++) {
-		e = timers + i;
-		put_timer(&tw->free, e);
-	}
-
 	for (i = 0; i < TW_N_RINGS; i++)
 		for (j = 0; j < TW_SLOTS_PER_RING; j++)
 			LIST_INIT(&tw->w[i][j].head);
@@ -223,11 +243,6 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval)
 		return NULL;
 	}
 
-	if (tw->free.num == 0) {
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-
 	nb_tick = interval / tw->prm.tick_size;
 
 	fast_ring_index = nb_tick & TW_RING_MASK;
@@ -248,10 +263,12 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval)
 		slow_ring_index %= TW_SLOTS_PER_RING;
 		ts = &tw->w[TW_RING_SLOW][slow_ring_index];
 
-		e = get_timer(&tw->free);
+		e = get_free_timer(tw);
 		e->obj = obj;
 		e->fast_index = fast_ring_index;
+		rte_spinlock_lock(&ts->lock);
 		put_timer(ts, e);
+		rte_spinlock_unlock(&ts->lock);
 
 		/* Return the user timer-cancellation handle */
 		return (void *)e;
@@ -260,9 +277,11 @@ tle_timer_start(struct tle_timer_wheel *tw, void *obj, uint64_t interval)
 	/* Timer expires less than 51.2 seconds from now */
 	ts = &tw->w[TW_RING_FAST][fast_ring_index];
 
-	e = get_timer(&tw->free);
+	e = get_free_timer(tw);
 	e->obj = obj;
+	rte_spinlock_lock(&ts->lock);
 	put_timer(ts, e);
+	rte_spinlock_unlock(&ts->lock);
 
 	/* Give the user a handle to cancel the timer */
 	return (void *)e;
@@ -277,8 +296,25 @@ void tle_timer_stop(struct tle_timer_wheel *tw, void *timer)
 	/* Cancel the timer */
 	e = (struct tle_timer_elmt *)timer;
 	ts = e->list;
-	rem_timer(ts, e);
-	put_timer(&tw->free, e);
+	while (ts != &tw->free) {
+		if (ts == NULL) {
+			rte_pause();
+			ts = e->list;
+			continue;
+		}
+		rte_spinlock_lock(&ts->lock);
+		if (ts != e->list) {
+			rte_spinlock_unlock(&ts->lock);
+			ts = e->list;
+			continue;
+		}
+		rem_timer(ts, e);
+		rte_spinlock_unlock(&ts->lock);
+		rte_spinlock_lock(&tw->free.lock);
+		put_timer(&tw->free, e);
+		rte_spinlock_unlock(&tw->free.lock);
+		break;
+	}
 }
 
 /** run the timer wheel. Call in every tick_size cycles
@@ -321,25 +357,33 @@ void tle_timer_expire(struct tle_timer_wheel *tw, uint64_t now)
 			ts = &tw->w[TW_RING_SLOW][slow_wheel_index];
 
 			/* Deal slow-ring elements into the fast ring. */
+			rte_spinlock_lock(&ts->lock);
 			while (ts->num != 0) {
 				e = get_timer(ts);
 				demoted_index = e->fast_index;
 				ts2 = &tw->w[TW_RING_FAST][demoted_index];
+				rte_spinlock_lock(&ts2->lock);
 				put_timer(ts2, e);
+				rte_spinlock_unlock(&ts2->lock);
 			};
 			LIST_INIT(&ts->head);
+			rte_spinlock_unlock(&ts->lock);
 		}
 
 		/* Handle the fast ring */
 		ts = &tw->w[TW_RING_FAST][fast_wheel_index];
 
		/* Clear the fast-ring slot and move timers in expired list */
+		rte_spinlock_lock(&ts->lock);
 		n = get_timers(ts, re, RTE_DIM(re));
+		rte_spinlock_lock(&tw->expired.lock);
 		while (n != 0) {
 			put_timers(&tw->expired, re, n);
 			n = get_timers(ts, re, RTE_DIM(re));
 		};
+		rte_spinlock_unlock(&tw->expired.lock);
 		LIST_INIT(&ts->head);
+		rte_spinlock_unlock(&ts->lock);
 
 		tw->current_index[TW_RING_FAST]++;
 		tw->current_tick++;
@@ -353,12 +397,16 @@ tle_timer_get_expired_bulk(struct tle_timer_wheel *tw, void *rt[], uint32_t num)
 	uint32_t i, n;
 	struct tle_timer_elmt *e[MAX_TIMER_BURST];
 
+	rte_spinlock_lock(&tw->expired.lock);
 	n = get_timers(&tw->expired, e, num);
+	rte_spinlock_unlock(&tw->expired.lock);
 
 	for (i = 0; i != n; i++)
 		rt[i] = e[i]->obj;
 
+	rte_spinlock_lock(&tw->free.lock);
 	put_timers(&tw->free, e, n);
+	rte_spinlock_unlock(&tw->free.lock);
 
 	return n;
 }
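With a spinlock per timer list, tle_timer_start()/tle_timer_stop() may now be called from application threads while one I/O thread drives the wheel; the driving loop itself is unchanged. A sketch of that loop (handle_timeout() is application code; MAX_TIMER_BURST as defined in this file; assumes the wheel's tick_size was expressed in TSC cycles):

    void *expired[MAX_TIMER_BURST];
    uint32_t i, n;

    tle_timer_expire(tw, rte_get_tsc_cycles());
    do {
        n = tle_timer_get_expired_bulk(tw, expired, RTE_DIM(expired));
        for (i = 0; i != n; i++)
            handle_timeout(expired[i]);
    } while (n != 0);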