X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvcl%2Fldp.c;h=0f3147610d820b51c1bfcebb3ef8d634921ef523;hb=729b9c94f8222346a61c21d21a674bcb9b5974f8;hp=9f195c64a1bbb4bb4ea4d3e3e84115676902a0fc;hpb=294afe297c74c7c9413c6bd4856e92c9bc439e7c;p=vpp.git diff --git a/src/vcl/ldp.c b/src/vcl/ldp.c index 9f195c64a1b..0f3147610d8 100644 --- a/src/vcl/ldp.c +++ b/src/vcl/ldp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2016-2019 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: @@ -12,6 +12,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#ifdef HAVE_GNU_SOURCE +#define _GNU_SOURCE +#endif + #include #include #include @@ -21,6 +26,7 @@ #include #include #include +#include #include #include @@ -51,6 +57,12 @@ #define LDP_MAX_NWORKERS 32 +#ifdef HAVE_GNU_SOURCE +#define SOCKADDR_GET_SA(__addr) __addr.__sockaddr__; +#else +#define SOCKADDR_GET_SA(__addr) _addr; +#endif + typedef struct ldp_worker_ctx_ { u8 *io_buffer; @@ -80,6 +92,7 @@ typedef struct ldp_worker_ctx_ * Epoll state */ u8 epoll_wait_vcl; + u8 mq_epfd_added; int vcl_mq_epfd; } ldp_worker_ctx_t; @@ -102,18 +115,30 @@ typedef struct /** vcl needs next epoll_create to go to libc_epoll */ u8 vcl_needs_real_epoll; + + /** + * crypto state used only for testing + */ + u8 transparent_tls; + u32 ckpair_index; } ldp_main_t; #define LDP_DEBUG ldp->debug #define LDBG(_lvl, _fmt, _args...) \ if (ldp->debug > _lvl) \ - clib_warning ("ldp<%d>: " _fmt, getpid(), ##_args) + { \ + int errno_saved = errno; \ + fprintf (stderr, "ldp<%d>: " _fmt "\n", getpid(), ##_args); \ + errno = errno_saved; \ + } static ldp_main_t ldp_main = { .vlsh_bit_val = (1 << LDP_SID_BIT_MIN), .vlsh_bit_mask = (1 << LDP_SID_BIT_MIN) - 1, .debug = LDP_DEBUG_INIT, + .transparent_tls = 0, + .ckpair_index = ~0, }; static ldp_main_t *ldp = &ldp_main; @@ -130,18 +155,14 @@ ldp_worker_get_current (void) static inline void ldp_set_app_name (char *app_name) { - int rv = snprintf (ldp->app_name, LDP_APP_NAME_MAX, - "ldp-%d-%s", getpid (), app_name); - - if (rv >= LDP_APP_NAME_MAX) - app_name[LDP_APP_NAME_MAX - 1] = 0; + snprintf (ldp->app_name, LDP_APP_NAME_MAX, "%s-ldp-%d", app_name, getpid ()); } static inline char * ldp_get_app_name () { if (ldp->app_name[0] == '\0') - ldp_set_app_name ("app"); + ldp_set_app_name (program_invocation_short_name); return ldp->app_name; } @@ -161,14 +182,21 @@ ldp_fd_to_vlsh (int fd) return (fd - ldp->vlsh_bit_val); } -static inline int +static void +ldp_alloc_workers (void) +{ + if (ldp->workers) + return; + pool_alloc (ldp->workers, LDP_MAX_NWORKERS); +} + +static int ldp_init (void) { ldp_worker_ctx_t *ldpw; int rv; - if (PREDICT_TRUE (ldp->init)) - return 0; + ASSERT (!ldp->init); ldp->init = 1; ldp->vcl_needs_real_epoll = 1; @@ -184,7 +212,7 @@ ldp_init (void) return rv; } ldp->vcl_needs_real_epoll = 0; - pool_alloc (ldp->workers, LDP_MAX_NWORKERS); + ldp_alloc_workers (); ldpw = ldp_worker_get_current (); char *env_var_str = getenv (LDP_ENV_DEBUG); @@ -254,27 +282,44 @@ ldp_init (void) /* Make sure there are enough bits in the fd set for vcl sessions */ if (ldp->vlsh_bit_val > FD_SETSIZE / 2) { - LDBG (0, "ERROR: LDP vlsh bit value %d > FD_SETSIZE/2 %d!", + /* Only valid for select/pselect, so just WARNING and not exit */ + LDBG (0, + "WARNING: LDP vlsh bit value %d > FD_SETSIZE/2 %d, " + "select/pselect not supported now!", ldp->vlsh_bit_val, FD_SETSIZE / 2); - ldp->init = 0; - return -1; } } + env_var_str = getenv (LDP_ENV_TLS_TRANS); + if (env_var_str) + { + ldp->transparent_tls = 1; + } + + /* *INDENT-OFF* */ + pool_foreach (ldpw, ldp->workers) { + clib_memset (&ldpw->clib_time, 0, sizeof (ldpw->clib_time)); + } + /* *INDENT-ON* */ - clib_time_init (&ldpw->clib_time); LDBG (0, "LDP initialization: done!"); return 0; } +#define ldp_init_check() \ + if (PREDICT_FALSE (!ldp->init)) \ + { \ + if ((errno = -ldp_init ())) \ + return -1; \ + } + int close (int fd) { vls_handle_t vlsh; int rv, epfd; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -325,8 +370,7 @@ read (int fd, void *buf, size_t nbytes) vls_handle_t vlsh; ssize_t size; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -353,30 +397,24 @@ readv (int fd, const struct iovec * iov, int iovcnt) vls_handle_t vlsh; ssize_t size = 0; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) { - do + for (i = 0; i < iovcnt; ++i) { - for (i = 0; i < iovcnt; ++i) + rv = vls_read (vlsh, iov[i].iov_base, iov[i].iov_len); + if (rv <= 0) + break; + else { - rv = vls_read (vlsh, iov[i].iov_base, iov[i].iov_len); - if (rv < 0) + total += rv; + if (rv < iov[i].iov_len) break; - else - { - total += rv; - if (rv < iov[i].iov_len) - break; - } } } - while ((rv >= 0) && (total == 0)); - - if (rv < 0) + if (rv < 0 && total == 0) { errno = -rv; size = -1; @@ -398,8 +436,7 @@ write (int fd, const void *buf, size_t nbytes) vls_handle_t vlsh; ssize_t size = 0; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -426,30 +463,25 @@ writev (int fd, const struct iovec * iov, int iovcnt) vls_handle_t vlsh; int i, rv = 0; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) { - do + for (i = 0; i < iovcnt; ++i) { - for (i = 0; i < iovcnt; ++i) + rv = vls_write_msg (vlsh, iov[i].iov_base, iov[i].iov_len); + if (rv < 0) + break; + else { - rv = vls_write_msg (vlsh, iov[i].iov_base, iov[i].iov_len); - if (rv < 0) + total += rv; + if (rv < iov[i].iov_len) break; - else - { - total += rv; - if (rv < iov[i].iov_len) - break; - } } } - while ((rv >= 0) && (total == 0)); - if (rv < 0) + if (rv < 0 && total == 0) { errno = -rv; size = -1; @@ -465,17 +497,11 @@ writev (int fd, const struct iovec * iov, int iovcnt) return size; } -int -fcntl (int fd, int cmd, ...) +static int +fcntl_internal (int fd, int cmd, va_list ap) { vls_handle_t vlsh; int rv = 0; - va_list ap; - - if ((errno = -ldp_init ())) - return -1; - - va_start (ap, cmd); vlsh = ldp_fd_to_vlsh (fd); LDBG (0, "fd %u vlsh %d, cmd %u", fd, vlsh, cmd); @@ -514,14 +540,45 @@ fcntl (int fd, int cmd, ...) } else { +#ifdef HAVE_FCNTL64 + rv = libc_vfcntl64 (fd, cmd, ap); +#else rv = libc_vfcntl (fd, cmd, ap); +#endif } + return rv; +} + +int +fcntl (int fd, int cmd, ...) +{ + va_list ap; + int rv; + + ldp_init_check (); + + va_start (ap, cmd); + rv = fcntl_internal (fd, cmd, ap); va_end (ap); return rv; } +int +fcntl64 (int fd, int cmd, ...) +{ + va_list ap; + int rv; + + ldp_init_check (); + + va_start (ap, cmd); + rv = fcntl_internal (fd, cmd, ap); + va_end (ap); + return rv; +} + int ioctl (int fd, unsigned long int cmd, ...) { @@ -529,8 +586,7 @@ ioctl (int fd, unsigned long int cmd, ...) va_list ap; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); va_start (ap, cmd); @@ -592,19 +648,20 @@ ldp_select_init_maps (fd_set * __restrict original, memset (original, 0, n_bytes); /* *INDENT-OFF* */ - clib_bitmap_foreach (fd, *resultb, ({ + clib_bitmap_foreach (fd, *resultb) { if (fd > nfds) break; vlsh = ldp_fd_to_vlsh (fd); if (vlsh == VLS_INVALID_HANDLE) clib_bitmap_set_no_check (*libcb, fd, 1); else - clib_bitmap_set_no_check (*vclb, vlsh_to_session_index (vlsh), 1); - })); + *vclb = clib_bitmap_set (*vclb, vlsh_to_session_index (vlsh), 1); + } /* *INDENT-ON* */ si_bits_set = clib_bitmap_last_set (*vclb) + 1; *si_bits = (si_bits_set > *si_bits) ? si_bits_set : *si_bits; + clib_bitmap_validate (*resultb, *si_bits); libc_bits_set = clib_bitmap_last_set (*libcb) + 1; *libc_bits = (libc_bits_set > *libc_bits) ? libc_bits_set : *libc_bits; @@ -621,8 +678,9 @@ ldp_select_vcl_map_to_libc (clib_bitmap_t * vclb, fd_set * __restrict libcb) return 0; /* *INDENT-OFF* */ - clib_bitmap_foreach (si, vclb, ({ + clib_bitmap_foreach (si, vclb) { vlsh = vls_session_index_to_vlsh (si); + ASSERT (vlsh != VLS_INVALID_HANDLE); fd = ldp_vlsh_to_fd (vlsh); if (PREDICT_FALSE (fd < 0)) { @@ -630,7 +688,7 @@ ldp_select_vcl_map_to_libc (clib_bitmap_t * vclb, fd_set * __restrict libcb) return -1; } FD_SET (fd, libcb); - })); + } /* *INDENT-ON* */ return 0; @@ -641,10 +699,12 @@ ldp_select_libc_map_merge (clib_bitmap_t * result, fd_set * __restrict libcb) { uword fd; + if (!libcb) + return; + /* *INDENT-OFF* */ - clib_bitmap_foreach (fd, result, ({ + clib_bitmap_foreach (fd, result) FD_SET ((int)fd, libcb); - })); /* *INDENT-ON* */ } @@ -668,15 +728,19 @@ ldp_pselect (int nfds, fd_set * __restrict readfds, return -1; } + if (PREDICT_FALSE (ldpw->clib_time.init_cpu_time == 0)) + clib_time_init (&ldpw->clib_time); + if (timeout) { time_out = (timeout->tv_sec == 0 && timeout->tv_nsec == 0) ? (f64) 0 : (f64) timeout->tv_sec + (f64) timeout->tv_nsec / (f64) 1e9; + time_out += clib_time_now (&ldpw->clib_time); + /* select as fine grained sleep */ if (!nfds) { - time_out += clib_time_now (&ldpw->clib_time); while (clib_time_now (&ldpw->clib_time) < time_out) ; return 0; @@ -720,7 +784,8 @@ ldp_pselect (int nfds, fd_set * __restrict readfds, goto done; } - libc_tspec = si_bits ? libc_tspec : *timeout; + if (!si_bits) + libc_tspec = timeout ? *timeout : libc_tspec; do { @@ -728,25 +793,25 @@ ldp_pselect (int nfds, fd_set * __restrict readfds, { if (readfds) clib_memcpy_fast (ldpw->rd_bitmap, ldpw->si_rd_bitmap, - vec_len (ldpw->rd_bitmap) * + vec_len (ldpw->si_rd_bitmap) * sizeof (clib_bitmap_t)); if (writefds) clib_memcpy_fast (ldpw->wr_bitmap, ldpw->si_wr_bitmap, - vec_len (ldpw->wr_bitmap) * + vec_len (ldpw->si_wr_bitmap) * sizeof (clib_bitmap_t)); if (exceptfds) clib_memcpy_fast (ldpw->ex_bitmap, ldpw->si_ex_bitmap, - vec_len (ldpw->ex_bitmap) * + vec_len (ldpw->si_ex_bitmap) * sizeof (clib_bitmap_t)); - rv = vppcom_select (si_bits, readfds ? ldpw->rd_bitmap : NULL, - writefds ? ldpw->wr_bitmap : NULL, - exceptfds ? ldpw->ex_bitmap : NULL, - vcl_timeout); + rv = vls_select (si_bits, readfds ? ldpw->rd_bitmap : NULL, + writefds ? ldpw->wr_bitmap : NULL, + exceptfds ? ldpw->ex_bitmap : NULL, vcl_timeout); if (rv < 0) { errno = -rv; rv = -1; + goto done; } else if (rv > 0) { @@ -851,6 +916,72 @@ pselect (int nfds, fd_set * __restrict readfds, } #endif +/* If transparent TLS mode is turned on, then ldp will load key and cert. + */ +static int +load_cert_key_pair (void) +{ + char *cert_str = getenv (LDP_ENV_TLS_CERT); + char *key_str = getenv (LDP_ENV_TLS_KEY); + char cert_buf[4096], key_buf[4096]; + int cert_size, key_size; + vppcom_cert_key_pair_t crypto; + int ckp_index; + FILE *fp; + + if (!cert_str || !key_str) + { + LDBG (0, "ERROR: failed to read LDP environment %s\n", + LDP_ENV_TLS_CERT); + return -1; + } + + fp = fopen (cert_str, "r"); + if (fp == NULL) + { + LDBG (0, "ERROR: failed to open cert file %s \n", cert_str); + return -1; + } + cert_size = fread (cert_buf, sizeof (char), sizeof (cert_buf), fp); + fclose (fp); + + fp = fopen (key_str, "r"); + if (fp == NULL) + { + LDBG (0, "ERROR: failed to open key file %s \n", key_str); + return -1; + } + key_size = fread (key_buf, sizeof (char), sizeof (key_buf), fp); + fclose (fp); + + crypto.cert = cert_buf; + crypto.key = key_buf; + crypto.cert_len = cert_size; + crypto.key_len = key_size; + ckp_index = vppcom_add_cert_key_pair (&crypto); + if (ckp_index < 0) + { + LDBG (0, "ERROR: failed to add cert key pair\n"); + return -1; + } + + ldp->ckpair_index = ckp_index; + + return 0; +} + +static int +assign_cert_key_pair (vls_handle_t vlsh) +{ + uint32_t ckp_len; + + if (ldp->ckpair_index == ~0 && load_cert_key_pair () < 0) + return -1; + + ckp_len = sizeof (ldp->ckpair_index); + return vls_attr (vlsh, VPPCOM_ATTR_SET_CKPAIR, &ldp->ckpair_index, &ckp_len); +} + int socket (int domain, int type, int protocol) { @@ -858,14 +989,19 @@ socket (int domain, int type, int protocol) u8 is_nonblocking = type & SOCK_NONBLOCK ? 1 : 0; vls_handle_t vlsh; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); if (((domain == AF_INET) || (domain == AF_INET6)) && ((sock_type == SOCK_STREAM) || (sock_type == SOCK_DGRAM))) { - u8 proto = ((sock_type == SOCK_DGRAM) ? - VPPCOM_PROTO_UDP : VPPCOM_PROTO_TCP); + u8 proto; + if (ldp->transparent_tls) + { + proto = VPPCOM_PROTO_TLS; + } + else + proto = ((sock_type == SOCK_DGRAM) ? + VPPCOM_PROTO_UDP : VPPCOM_PROTO_TCP); LDBG (0, "calling vls_create: proto %u (%s), is_nonblocking %u", proto, vppcom_proto_str (proto), is_nonblocking); @@ -878,6 +1014,11 @@ socket (int domain, int type, int protocol) } else { + if (ldp->transparent_tls) + { + if (assign_cert_key_pair (vlsh) < 0) + return -1; + } rv = ldp_vlsh_to_fd (vlsh); } } @@ -902,8 +1043,7 @@ socketpair (int domain, int type, int protocol, int fds[2]) { int rv, sock_type = type & ~(SOCK_CLOEXEC | SOCK_NONBLOCK); - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); if (((domain == AF_INET) || (domain == AF_INET6)) && ((sock_type == SOCK_STREAM) || (sock_type == SOCK_DGRAM))) @@ -922,13 +1062,13 @@ socketpair (int domain, int type, int protocol, int fds[2]) } int -bind (int fd, __CONST_SOCKADDR_ARG addr, socklen_t len) +bind (int fd, __CONST_SOCKADDR_ARG _addr, socklen_t len) { + const struct sockaddr *addr = SOCKADDR_GET_SA (_addr); vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -995,14 +1135,12 @@ done: } static inline int -ldp_copy_ep_to_sockaddr (__SOCKADDR_ARG addr, socklen_t * __restrict len, - vppcom_endpt_t * ep) +ldp_copy_ep_to_sockaddr (struct sockaddr *addr, socklen_t *__restrict len, + vppcom_endpt_t *ep) { - int rv = 0; - int sa_len, copy_len; + int rv = 0, sa_len, copy_len; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); if (addr && len && ep) { @@ -1041,13 +1179,13 @@ ldp_copy_ep_to_sockaddr (__SOCKADDR_ARG addr, socklen_t * __restrict len, } int -getsockname (int fd, __SOCKADDR_ARG addr, socklen_t * __restrict len) +getsockname (int fd, __SOCKADDR_ARG _addr, socklen_t *__restrict len) { + struct sockaddr *addr = SOCKADDR_GET_SA (_addr); vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -1076,20 +1214,20 @@ getsockname (int fd, __SOCKADDR_ARG addr, socklen_t * __restrict len) } else { - rv = libc_getsockname (fd, addr, len); + rv = libc_getsockname (fd, _addr, len); } return rv; } int -connect (int fd, __CONST_SOCKADDR_ARG addr, socklen_t len) +connect (int fd, __CONST_SOCKADDR_ARG _addr, socklen_t len) { + const struct sockaddr *addr = SOCKADDR_GET_SA (_addr); vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); if (!addr) { @@ -1165,13 +1303,13 @@ done: } int -getpeername (int fd, __SOCKADDR_ARG addr, socklen_t * __restrict len) +getpeername (int fd, __SOCKADDR_ARG _addr, socklen_t *__restrict len) { + struct sockaddr *addr = SOCKADDR_GET_SA (_addr); vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -1211,8 +1349,7 @@ send (int fd, const void *buf, size_t n, int flags) vls_handle_t vlsh = ldp_fd_to_vlsh (fd); ssize_t size; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); if (vlsh != VLS_INVALID_HANDLE) { @@ -1238,8 +1375,7 @@ sendfile (int out_fd, int in_fd, off_t * offset, size_t len) vls_handle_t vlsh; ssize_t size = 0; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (out_fd); if (vlsh != VLS_INVALID_HANDLE) @@ -1281,7 +1417,7 @@ sendfile (int out_fd, int in_fd, off_t * offset, size_t len) size = vls_attr (vlsh, VPPCOM_ATTR_GET_NWRITE, 0, 0); if (size < 0) { - LDBG (0, "ERROR: fd %d: vls_attr: vlsh %u returned %d (%s)!", + LDBG (0, "ERROR: fd %d: vls_attr: vlsh %u returned %ld (%s)!", out_fd, vlsh, size, vppcom_retval_str (size)); vec_reset_length (ldpw->io_buffer); errno = -size; @@ -1388,15 +1524,17 @@ recv (int fd, void *buf, size_t n, int flags) vls_handle_t vlsh; ssize_t size; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) { size = vls_recvfrom (vlsh, buf, n, flags, NULL); if (size < 0) - errno = -size; + { + errno = -size; + size = -1; + } } else { @@ -1407,50 +1545,95 @@ recv (int fd, void *buf, size_t n, int flags) } ssize_t -sendto (int fd, const void *buf, size_t n, int flags, - __CONST_SOCKADDR_ARG addr, socklen_t addr_len) +__recv_chk (int fd, void *buf, size_t n, size_t buflen, int flags) { - vls_handle_t vlsh; - ssize_t size; - - if ((errno = -ldp_init ())) + if (n > buflen) return -1; - vlsh = ldp_fd_to_vlsh (fd); - if (vlsh != INVALID_SESSION_ID) - { - vppcom_endpt_t *ep = 0; - vppcom_endpt_t _ep; + return recv (fd, buf, n, flags); +} - if (addr) +static inline int +ldp_vls_sendo (vls_handle_t vlsh, const void *buf, size_t n, + vppcom_endpt_tlv_t *app_tlvs, int flags, + __CONST_SOCKADDR_ARG _addr, socklen_t addr_len) +{ + const struct sockaddr *addr = SOCKADDR_GET_SA (_addr); + vppcom_endpt_t *ep = 0; + vppcom_endpt_t _ep; + + _ep.app_tlvs = app_tlvs; + + if (addr) + { + ep = &_ep; + switch (addr->sa_family) { - ep = &_ep; - switch (addr->sa_family) - { - case AF_INET: - ep->is_ip4 = VPPCOM_IS_IP4; - ep->ip = - (uint8_t *) & ((const struct sockaddr_in *) addr)->sin_addr; - ep->port = - (uint16_t) ((const struct sockaddr_in *) addr)->sin_port; - break; + case AF_INET: + ep->is_ip4 = VPPCOM_IS_IP4; + ep->ip = + (uint8_t *) & ((const struct sockaddr_in *) addr)->sin_addr; + ep->port = (uint16_t) ((const struct sockaddr_in *) addr)->sin_port; + break; - case AF_INET6: - ep->is_ip4 = VPPCOM_IS_IP6; - ep->ip = - (uint8_t *) & ((const struct sockaddr_in6 *) addr)->sin6_addr; - ep->port = - (uint16_t) ((const struct sockaddr_in6 *) addr)->sin6_port; - break; + case AF_INET6: + ep->is_ip4 = VPPCOM_IS_IP6; + ep->ip = + (uint8_t *) & ((const struct sockaddr_in6 *) addr)->sin6_addr; + ep->port = + (uint16_t) ((const struct sockaddr_in6 *) addr)->sin6_port; + break; - default: - errno = EAFNOSUPPORT; - size = -1; - goto done; - } + default: + return EAFNOSUPPORT; } + } + + return vls_sendto (vlsh, (void *) buf, n, flags, ep); +} + +static int +ldp_vls_recvfrom (vls_handle_t vlsh, void *__restrict buf, size_t n, int flags, + __SOCKADDR_ARG _addr, socklen_t *__restrict addr_len) +{ + u8 src_addr[sizeof (struct sockaddr_in6)]; + struct sockaddr *addr = SOCKADDR_GET_SA (_addr); + vppcom_endpt_t ep; + ssize_t size; + int rv; + + if (addr) + { + ep.ip = src_addr; + size = vls_recvfrom (vlsh, buf, n, flags, &ep); - size = vls_sendto (vlsh, (void *) buf, n, flags, ep); + if (size > 0) + { + rv = ldp_copy_ep_to_sockaddr (addr, addr_len, &ep); + if (rv < 0) + size = rv; + } + } + else + size = vls_recvfrom (vlsh, buf, n, flags, NULL); + + return size; +} + +ssize_t +sendto (int fd, const void *buf, size_t n, int flags, + __CONST_SOCKADDR_ARG _addr, socklen_t addr_len) +{ + const struct sockaddr *addr = SOCKADDR_GET_SA (_addr); + vls_handle_t vlsh; + ssize_t size; + + ldp_init_check (); + + vlsh = ldp_fd_to_vlsh (fd); + if (vlsh != VLS_INVALID_HANDLE) + { + size = ldp_vls_sendo (vlsh, buf, n, NULL, flags, addr, addr_len); if (size < 0) { errno = -size; @@ -1462,7 +1645,6 @@ sendto (int fd, const void *buf, size_t n, int flags, size = libc_sendto (fd, buf, n, flags, addr, addr_len); } -done: return size; } @@ -1470,29 +1652,15 @@ ssize_t recvfrom (int fd, void *__restrict buf, size_t n, int flags, __SOCKADDR_ARG addr, socklen_t * __restrict addr_len) { - vls_handle_t sid; + vls_handle_t vlsh; ssize_t size; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); - sid = ldp_fd_to_vlsh (fd); - if (sid != VLS_INVALID_HANDLE) + vlsh = ldp_fd_to_vlsh (fd); + if (vlsh != VLS_INVALID_HANDLE) { - vppcom_endpt_t ep; - u8 src_addr[sizeof (struct sockaddr_in6)]; - - if (addr) - { - ep.ip = src_addr; - size = vls_recvfrom (sid, buf, n, flags, &ep); - - if (size > 0) - size = ldp_copy_ep_to_sockaddr (addr, addr_len, &ep); - } - else - size = vls_recvfrom (sid, buf, n, flags, NULL); - + size = ldp_vls_recvfrom (vlsh, buf, n, flags, addr, addr_len); if (size < 0) { errno = -size; @@ -1507,31 +1675,148 @@ recvfrom (int fd, void *__restrict buf, size_t n, int flags, return size; } +static int +ldp_parse_cmsg (vls_handle_t vlsh, const struct msghdr *msg, + vppcom_endpt_tlv_t **app_tlvs) +{ + uint8_t *ad, *at = (uint8_t *) *app_tlvs; + vppcom_endpt_tlv_t *adh; + struct in_pktinfo *pi; + struct cmsghdr *cmsg; + + cmsg = CMSG_FIRSTHDR (msg); + + while (cmsg != NULL) + { + switch (cmsg->cmsg_level) + { + case SOL_UDP: + switch (cmsg->cmsg_type) + { + case UDP_SEGMENT: + vec_add2 (at, adh, sizeof (*adh)); + adh->data_type = VCL_UDP_SEGMENT; + adh->data_len = sizeof (uint16_t); + vec_add2 (at, ad, sizeof (uint16_t)); + *(uint16_t *) ad = *(uint16_t *) CMSG_DATA (cmsg); + break; + default: + LDBG (1, "SOL_UDP cmsg_type %u not supported", cmsg->cmsg_type); + break; + } + break; + case SOL_IP: + switch (cmsg->cmsg_type) + { + case IP_PKTINFO: + vec_add2 (at, adh, sizeof (*adh)); + adh->data_type = VCL_IP_PKTINFO; + adh->data_len = sizeof (struct in_addr); + vec_add2 (at, ad, sizeof (struct in_addr)); + pi = (void *) CMSG_DATA (cmsg); + clib_memcpy_fast (ad, &pi->ipi_spec_dst, + sizeof (struct in_addr)); + break; + default: + LDBG (1, "SOL_IP cmsg_type %u not supported", cmsg->cmsg_type); + break; + } + break; + default: + LDBG (1, "cmsg_level %u not supported", cmsg->cmsg_level); + break; + } + cmsg = CMSG_NXTHDR ((struct msghdr *) msg, cmsg); + } + *app_tlvs = (vppcom_endpt_tlv_t *) at; + return 0; +} + +static int +ldp_make_cmsg (vls_handle_t vlsh, struct msghdr *msg) +{ + u32 optval, optlen = sizeof (optval); + struct cmsghdr *cmsg; + + cmsg = CMSG_FIRSTHDR (msg); + + if (!vls_attr (vlsh, VPPCOM_ATTR_GET_IP_PKTINFO, (void *) &optval, &optlen)) + return 0; + + if (optval) + { + vppcom_endpt_t ep; + u8 addr_buf[sizeof (struct in_addr)]; + u32 size = sizeof (ep); + + ep.ip = addr_buf; + + if (!vls_attr (vlsh, VPPCOM_ATTR_GET_LCL_ADDR, &ep, &size)) + { + struct in_pktinfo pi = {}; + + clib_memcpy (&pi.ipi_addr, ep.ip, sizeof (struct in_addr)); + cmsg->cmsg_level = SOL_IP; + cmsg->cmsg_type = IP_PKTINFO; + cmsg->cmsg_len = CMSG_LEN (sizeof (pi)); + clib_memcpy (CMSG_DATA (cmsg), &pi, sizeof (pi)); + } + } + + return 0; +} + ssize_t -sendmsg (int fd, const struct msghdr * message, int flags) +sendmsg (int fd, const struct msghdr * msg, int flags) { vls_handle_t vlsh; ssize_t size; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) { - LDBG (0, "LDP-TBD"); - errno = ENOSYS; - size = -1; + vppcom_endpt_tlv_t *app_tlvs = 0; + struct iovec *iov = msg->msg_iov; + ssize_t total = 0; + int i, rv = 0; + + ldp_parse_cmsg (vlsh, msg, &app_tlvs); + + for (i = 0; i < msg->msg_iovlen; ++i) + { + rv = ldp_vls_sendo (vlsh, iov[i].iov_base, iov[i].iov_len, app_tlvs, + flags, msg->msg_name, msg->msg_namelen); + if (rv < 0) + break; + else + { + total += rv; + if (rv < iov[i].iov_len) + break; + } + } + + vec_free (app_tlvs); + + if (rv < 0 && total == 0) + { + errno = -rv; + size = -1; + } + else + size = total; } else { - size = libc_sendmsg (fd, message, flags); + size = libc_sendmsg (fd, msg, flags); } return size; } -#ifdef USE_GNU +#ifdef _GNU_SOURCE int sendmmsg (int fd, struct mmsghdr *vmessages, unsigned int vlen, int flags) { @@ -1539,10 +1824,9 @@ sendmmsg (int fd, struct mmsghdr *vmessages, unsigned int vlen, int flags) const char *func_str; u32 sh = ldp_fd_to_vlsh (fd); - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); - if (sh != INVALID_SESSION_ID) + if (sh != VLS_INVALID_HANDLE) { clib_warning ("LDP<%d>: LDP-TBD", getpid ()); errno = ENOSYS; @@ -1580,76 +1864,115 @@ sendmmsg (int fd, struct mmsghdr *vmessages, unsigned int vlen, int flags) #endif ssize_t -recvmsg (int fd, struct msghdr * message, int flags) +recvmsg (int fd, struct msghdr * msg, int flags) { vls_handle_t vlsh; ssize_t size; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) { - LDBG (0, "LDP-TBD"); - errno = ENOSYS; - size = -1; + struct iovec *iov = msg->msg_iov; + ssize_t max_deq, total = 0; + int i, rv; + + max_deq = vls_attr (vlsh, VPPCOM_ATTR_GET_NREAD, 0, 0); + if (!max_deq) + return 0; + + for (i = 0; i < msg->msg_iovlen; i++) + { + rv = ldp_vls_recvfrom (vlsh, iov[i].iov_base, iov[i].iov_len, flags, + (i == 0 ? msg->msg_name : NULL), + (i == 0 ? &msg->msg_namelen : NULL)); + if (rv <= 0) + break; + else + { + total += rv; + if (rv < iov[i].iov_len) + break; + } + if (total >= max_deq) + break; + } + + if (rv < 0 && total == 0) + { + errno = -rv; + size = -1; + } + else + { + if (msg->msg_controllen) + ldp_make_cmsg (vlsh, msg); + size = total; + } } else { - size = libc_recvmsg (fd, message, flags); + size = libc_recvmsg (fd, msg, flags); } return size; } -#ifdef USE_GNU +#ifdef _GNU_SOURCE int recvmmsg (int fd, struct mmsghdr *vmessages, unsigned int vlen, int flags, struct timespec *tmo) { - ssize_t size; - const char *func_str; - u32 sh = ldp_fd_to_vlsh (fd); - - if ((errno = -ldp_init ())) - return -1; - - if (sh != INVALID_SESSION_ID) - { - clib_warning ("LDP<%d>: LDP-TBD", getpid ()); - errno = ENOSYS; - size = -1; - } - else - { - func_str = "libc_recvmmsg"; + ldp_worker_ctx_t *ldpw = ldp_worker_get_current (); + u32 sh; - if (LDP_DEBUG > 2) - clib_warning ("LDP<%d>: fd %d (0x%x): calling %s(): " - "vmessages %p, vlen %u, flags 0x%x, tmo %p", - getpid (), fd, fd, func_str, vmessages, vlen, - flags, tmo); + ldp_init_check (); - size = libc_recvmmsg (fd, vmessages, vlen, flags, tmo); - } + sh = ldp_fd_to_vlsh (fd); - if (LDP_DEBUG > 2) + if (sh != VLS_INVALID_HANDLE) { - if (size < 0) + struct mmsghdr *mh; + ssize_t rv = 0; + u32 nvecs = 0; + f64 time_out; + + if (PREDICT_FALSE (ldpw->clib_time.init_cpu_time == 0)) + clib_time_init (&ldpw->clib_time); + if (tmo) { - int errno_val = errno; - perror (func_str); - clib_warning ("LDP<%d>: ERROR: fd %d (0x%x): %s() failed! " - "rv %d, errno = %d", getpid (), fd, fd, - func_str, size, errno_val); - errno = errno_val; + time_out = (f64) tmo->tv_sec + (f64) tmo->tv_nsec / (f64) 1e9; + time_out += clib_time_now (&ldpw->clib_time); } else - clib_warning ("LDP<%d>: fd %d (0x%x): returning %d (0x%x)", - getpid (), fd, fd, size, size); + { + time_out = (f64) ~0; + } + + while (nvecs < vlen) + { + mh = &vmessages[nvecs]; + rv = recvmsg (fd, &mh->msg_hdr, flags); + if (rv > 0) + { + mh->msg_len = rv; + nvecs += 1; + continue; + } + + if (!time_out || clib_time_now (&ldpw->clib_time) >= time_out) + break; + + usleep (1); + } + + return nvecs > 0 ? nvecs : rv; + } + else + { + return libc_recvmmsg (fd, vmessages, vlen, flags, tmo); } - return size; } #endif @@ -1660,8 +1983,7 @@ getsockopt (int fd, int level, int optname, vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -1700,6 +2022,11 @@ getsockopt (int fd, int level, int optname, else rv = -EFAULT; break; + case TCP_CONGESTION: + *optlen = strlen ("cubic"); + strncpy (optval, "cubic", *optlen + 1); + rv = 0; + break; default: LDBG (0, "ERROR: fd %d: getsockopt SOL_TCP: sid %u, " "optname %d unsupported!", fd, vlsh, optname); @@ -1742,12 +2069,21 @@ getsockopt (int fd, int level, int optname, case SO_REUSEADDR: rv = vls_attr (vlsh, VPPCOM_ATTR_GET_REUSEADDR, optval, optlen); break; + case SO_REUSEPORT: + rv = vls_attr (vlsh, VPPCOM_ATTR_GET_REUSEPORT, optval, optlen); + break; case SO_BROADCAST: rv = vls_attr (vlsh, VPPCOM_ATTR_GET_BROADCAST, optval, optlen); break; + case SO_DOMAIN: + rv = vls_attr (vlsh, VPPCOM_ATTR_GET_DOMAIN, optval, optlen); + break; case SO_ERROR: rv = vls_attr (vlsh, VPPCOM_ATTR_GET_ERROR, optval, optlen); break; + case SO_BINDTODEVICE: + rv = 0; + break; default: LDBG (0, "ERROR: fd %d: getsockopt SOL_SOCKET: vlsh %u " "optname %d unsupported!", fd, vlsh, optname); @@ -1779,8 +2115,7 @@ setsockopt (int fd, int level, int optname, vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -1808,6 +2143,11 @@ setsockopt (int fd, int level, int optname, rv = vls_attr (vlsh, VPPCOM_ATTR_SET_TCP_KEEPINTVL, (void *) optval, &optlen); break; + case TCP_CONGESTION: + case TCP_CORK: + /* Ignore */ + rv = 0; + break; default: LDBG (0, "ERROR: fd %d: setsockopt() SOL_TCP: vlsh %u" "optname %d unsupported!", fd, vlsh, optname); @@ -1838,16 +2178,38 @@ setsockopt (int fd, int level, int optname, rv = vls_attr (vlsh, VPPCOM_ATTR_SET_REUSEADDR, (void *) optval, &optlen); break; + case SO_REUSEPORT: + rv = vls_attr (vlsh, VPPCOM_ATTR_SET_REUSEPORT, (void *) optval, + &optlen); + break; case SO_BROADCAST: rv = vls_attr (vlsh, VPPCOM_ATTR_SET_BROADCAST, (void *) optval, &optlen); break; + case SO_LINGER: + rv = 0; + break; default: LDBG (0, "ERROR: fd %d: setsockopt SOL_SOCKET: vlsh %u " "optname %d unsupported!", fd, vlsh, optname); break; } break; + case SOL_IP: + switch (optname) + { + case IP_PKTINFO: + rv = vls_attr (vlsh, VPPCOM_ATTR_SET_IP_PKTINFO, (void *) optval, + &optlen); + break; + default: + LDBG (0, + "ERROR: fd %d: setsockopt SOL_IP: vlsh %u optname %d" + "unsupported!", + fd, vlsh, optname); + break; + } + break; default: break; } @@ -1872,8 +2234,7 @@ listen (int fd, int n) vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) @@ -1898,14 +2259,14 @@ listen (int fd, int n) } static inline int -ldp_accept4 (int listen_fd, __SOCKADDR_ARG addr, - socklen_t * __restrict addr_len, int flags) +ldp_accept4 (int listen_fd, __SOCKADDR_ARG _addr, + socklen_t *__restrict addr_len, int flags) { + struct sockaddr *addr = SOCKADDR_GET_SA (_addr); vls_handle_t listen_vlsh, accept_vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); listen_vlsh = ldp_fd_to_vlsh (listen_fd); if (listen_vlsh != VLS_INVALID_HANDLE) @@ -1916,7 +2277,7 @@ ldp_accept4 (int listen_fd, __SOCKADDR_ARG addr, ep.ip = src_addr; LDBG (0, "listen fd %d: calling vppcom_session_accept: listen sid %u," - " ep %p, flags 0x%x", listen_fd, listen_vlsh, ep, flags); + " ep %p, flags 0x%x", listen_fd, listen_vlsh, &ep, flags); accept_vlsh = vls_accept (listen_vlsh, &ep, flags); if (accept_vlsh < 0) @@ -1969,31 +2330,15 @@ int shutdown (int fd, int how) { vls_handle_t vlsh; - int rv = 0, flags; - u32 flags_len = sizeof (flags); + int rv = 0; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vlsh = ldp_fd_to_vlsh (fd); if (vlsh != VLS_INVALID_HANDLE) { LDBG (0, "called shutdown: fd %u vlsh %u how %d", fd, vlsh, how); - - if (vls_attr (vlsh, VPPCOM_ATTR_SET_SHUT, &how, &flags_len)) - { - close (fd); - return -1; - } - - if (vls_attr (vlsh, VPPCOM_ATTR_GET_SHUT, &flags, &flags_len)) - { - close (fd); - return -1; - } - - if (flags == SHUT_RDWR) - rv = close (fd); + rv = vls_shutdown (vlsh, how); } else { @@ -2011,11 +2356,16 @@ epoll_create1 (int flags) vls_handle_t vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); - if (ldp->vcl_needs_real_epoll) + if (ldp->vcl_needs_real_epoll || vls_use_real_epoll ()) { + /* Make sure workers have been allocated */ + if (!ldp->workers) + { + ldp_alloc_workers (); + ldpw = ldp_worker_get_current (); + } rv = libc_epoll_create1 (flags); ldp->vcl_needs_real_epoll = 0; ldpw->vcl_mq_epfd = rv; @@ -2049,8 +2399,7 @@ epoll_ctl (int epfd, int op, int fd, struct epoll_event *event) vls_handle_t vep_vlsh, vlsh; int rv; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); vep_vlsh = ldp_fd_to_vlsh (epfd); if (PREDICT_FALSE (vep_vlsh == VLS_INVALID_HANDLE)) @@ -2075,7 +2424,7 @@ epoll_ctl (int epfd, int op, int fd, struct epoll_event *event) if (vlsh != VLS_INVALID_HANDLE) { LDBG (1, "epfd %d: calling vls_epoll_ctl: ep_vlsh %d op %d, vlsh %u," - " event %p", epfd, vep_vlsh, vlsh, event); + " event %p", epfd, vep_vlsh, op, vlsh, event); rv = vls_epoll_ctl (vep_vlsh, op, vlsh, event); if (rv != VPPCOM_OK) @@ -2132,13 +2481,12 @@ static inline int ldp_epoll_pwait (int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t * sigmask) { - ldp_worker_ctx_t *ldpw = ldp_worker_get_current (); - double time_to_wait = (double) 0, time_out, now = 0; + ldp_worker_ctx_t *ldpw; + double time_to_wait = (double) 0, max_time; int libc_epfd, rv = 0; vls_handle_t ep_vlsh; - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); if (PREDICT_FALSE (!events || (timeout < -1))) { @@ -2146,6 +2494,10 @@ ldp_epoll_pwait (int epfd, struct epoll_event *events, int maxevents, return -1; } + if (PREDICT_FALSE (vppcom_worker_index () == ~0)) + vls_register_vcl_worker (); + + ldpw = ldp_worker_get_current (); if (epfd == ldpw->vcl_mq_epfd) return libc_epoll_pwait (epfd, events, maxevents, timeout, sigmask); @@ -2157,8 +2509,10 @@ ldp_epoll_pwait (int epfd, struct epoll_event *events, int maxevents, return -1; } + if (PREDICT_FALSE (ldpw->clib_time.init_cpu_time == 0)) + clib_time_init (&ldpw->clib_time); time_to_wait = ((timeout >= 0) ? (double) timeout / 1000 : 0); - time_out = clib_time_now (&ldpw->clib_time) + time_to_wait; + max_time = clib_time_now (&ldpw->clib_time) + time_to_wait; libc_epfd = vls_attr (ep_vlsh, VPPCOM_ATTR_GET_LIBC_EPFD, 0, 0); if (PREDICT_FALSE (libc_epfd < 0)) @@ -2170,8 +2524,7 @@ ldp_epoll_pwait (int epfd, struct epoll_event *events, int maxevents, LDBG (2, "epfd %d: vep_idx %d, libc_epfd %d, events %p, maxevents %d, " "timeout %d, sigmask %p: time_to_wait %.02f", epfd, ep_vlsh, - libc_epfd, events, maxevents, timeout, sigmask, time_to_wait, - time_out); + libc_epfd, events, maxevents, timeout, sigmask, time_to_wait); do { if (!ldpw->epoll_wait_vcl) @@ -2198,11 +2551,122 @@ ldp_epoll_pwait (int epfd, struct epoll_event *events, int maxevents, if (rv != 0) goto done; } + } + while ((timeout == -1) || (clib_time_now (&ldpw->clib_time) < max_time)); + +done: + return rv; +} + +static inline int +ldp_epoll_pwait_eventfd (int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t * sigmask) +{ + ldp_worker_ctx_t *ldpw; + int libc_epfd, rv = 0, num_ev; + vls_handle_t ep_vlsh; + + ldp_init_check (); - if (timeout != -1) - now = clib_time_now (&ldpw->clib_time); + if (PREDICT_FALSE (!events || (timeout < -1))) + { + errno = EFAULT; + return -1; + } + + /* Make sure the vcl worker is valid. Could be that epoll fd was created on + * one thread but it is now used on another */ + if (PREDICT_FALSE (vppcom_worker_index () == ~0)) + vls_register_vcl_worker (); + + ldpw = ldp_worker_get_current (); + if (epfd == ldpw->vcl_mq_epfd) + return libc_epoll_pwait (epfd, events, maxevents, timeout, sigmask); + + ep_vlsh = ldp_fd_to_vlsh (epfd); + if (PREDICT_FALSE (ep_vlsh == VLS_INVALID_HANDLE)) + { + LDBG (0, "epfd %d: bad ep_vlsh %d!", epfd, ep_vlsh); + errno = EBADFD; + return -1; + } + + libc_epfd = vls_attr (ep_vlsh, VPPCOM_ATTR_GET_LIBC_EPFD, 0, 0); + if (PREDICT_FALSE (!libc_epfd)) + { + u32 size = sizeof (epfd); + + LDBG (1, "epfd %d, vep_vlsh %d calling libc_epoll_create1: " + "EPOLL_CLOEXEC", epfd, ep_vlsh); + libc_epfd = libc_epoll_create1 (EPOLL_CLOEXEC); + if (libc_epfd < 0) + { + rv = libc_epfd; + goto done; + } + + rv = vls_attr (ep_vlsh, VPPCOM_ATTR_SET_LIBC_EPFD, &libc_epfd, &size); + if (rv < 0) + { + errno = -rv; + rv = -1; + goto done; + } + } + if (PREDICT_FALSE (libc_epfd <= 0)) + { + errno = -libc_epfd; + rv = -1; + goto done; + } + + if (PREDICT_FALSE (!ldpw->mq_epfd_added)) + { + struct epoll_event e = { 0 }; + e.events = EPOLLIN; + e.data.fd = ldpw->vcl_mq_epfd; + if (libc_epoll_ctl (libc_epfd, EPOLL_CTL_ADD, ldpw->vcl_mq_epfd, &e) < + 0) + { + LDBG (0, "epfd %d, add libc mq epoll fd %d to libc epoll fd %d", + epfd, ldpw->vcl_mq_epfd, libc_epfd); + rv = -1; + goto done; + } + ldpw->mq_epfd_added = 1; + } + + /* Request to only drain unhandled to prevent libc_epoll_wait starved */ + rv = vls_epoll_wait (ep_vlsh, events, maxevents, -2); + if (rv > 0) + goto done; + else if (PREDICT_FALSE (rv < 0)) + { + errno = -rv; + rv = -1; + goto done; + } + + rv = libc_epoll_pwait (libc_epfd, events, maxevents, timeout, sigmask); + if (rv <= 0) + goto done; + for (int i = 0; i < rv; i++) + { + if (events[i].data.fd == ldpw->vcl_mq_epfd) + { + /* We should remove mq epoll fd from events. */ + rv--; + if (i != rv) + { + events[i].events = events[rv].events; + events[i].data.u64 = events[rv].data.u64; + } + num_ev = vls_epoll_wait (ep_vlsh, &events[rv], maxevents - rv, 0); + if (PREDICT_TRUE (num_ev > 0)) + rv += num_ev; + break; + } } - while (now < time_out); done: return rv; @@ -2212,13 +2676,20 @@ int epoll_pwait (int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t * sigmask) { - return ldp_epoll_pwait (epfd, events, maxevents, timeout, sigmask); + if (vls_use_eventfd ()) + return ldp_epoll_pwait_eventfd (epfd, events, maxevents, timeout, + sigmask); + else + return ldp_epoll_pwait (epfd, events, maxevents, timeout, sigmask); } int epoll_wait (int epfd, struct epoll_event *events, int maxevents, int timeout) { - return ldp_epoll_pwait (epfd, events, maxevents, timeout, NULL); + if (vls_use_eventfd ()) + return ldp_epoll_pwait_eventfd (epfd, events, maxevents, timeout, NULL); + else + return ldp_epoll_pwait (epfd, events, maxevents, timeout, NULL); } int @@ -2228,14 +2699,15 @@ poll (struct pollfd *fds, nfds_t nfds, int timeout) int rv, i, n_revents = 0; vls_handle_t vlsh; vcl_poll_t *vp; - double wait_for_time; + double max_time; - LDBG (3, "fds %p, nfds %d, timeout %d", fds, nfds, timeout); + LDBG (3, "fds %p, nfds %ld, timeout %d", fds, nfds, timeout); - if (timeout >= 0) - wait_for_time = (f64) timeout / 1000; - else - wait_for_time = -1; + if (PREDICT_FALSE (ldpw->clib_time.init_cpu_time == 0)) + clib_time_init (&ldpw->clib_time); + + max_time = (timeout >= 0) ? (f64) timeout / 1000 : 0; + max_time += clib_time_now (&ldpw->clib_time); for (i = 0; i < nfds; i++) { @@ -2295,8 +2767,7 @@ poll (struct pollfd *fds, nfds_t nfds, int timeout) goto done; } } - while ((wait_for_time == -1) || - (clib_time_now (&ldpw->clib_time) < wait_for_time)); + while ((timeout < 0) || (clib_time_now (&ldpw->clib_time) < max_time)); rv = 0; done: @@ -2325,13 +2796,12 @@ done: return rv; } -#ifdef USE_GNU +#ifdef _GNU_SOURCE int ppoll (struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t * sigmask) { - if ((errno = -ldp_init ())) - return -1; + ldp_init_check (); clib_warning ("LDP<%d>: LDP-TBD", getpid ()); errno = ENOSYS; @@ -2353,8 +2823,11 @@ ldp_constructor (void) { swrap_constructor (); if (ldp_init () != 0) - fprintf (stderr, "\nLDP<%d>: ERROR: ldp_constructor: failed!\n", - getpid ()); + { + fprintf (stderr, "\nLDP<%d>: ERROR: ldp_constructor: failed!\n", + getpid ()); + _exit (1); + } else if (LDP_DEBUG > 0) clib_warning ("LDP<%d>: LDP constructor: done!\n", getpid ()); } @@ -2365,16 +2838,18 @@ ldp_constructor (void) void ldp_destructor (void) { - swrap_destructor (); - if (ldp->init) - ldp->init = 0; + /* + swrap_destructor (); + if (ldp->init) + ldp->init = 0; + */ /* Don't use clib_warning() here because that calls writev() * which will call ldp_init(). */ if (LDP_DEBUG > 0) - printf ("%s:%d: LDP<%d>: LDP destructor: done!\n", - __func__, __LINE__, getpid ()); + fprintf (stderr, "%s:%d: LDP<%d>: LDP destructor: done!\n", + __func__, __LINE__, getpid ()); }