From 5c72a3780f765ba165c240c11921e0f2d229c702 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 23 Apr 2025 19:38:35 +0200 Subject: [PATCH] vlib: refactor file handling - remove unix-input pre-input node - move epoll functionality to graph scheduler to ensure epoll happens before other pre-input nodes - add thread wakeup capability (each thread got eventfd instance) Type: improvement Change-Id: I1d6dfb61199b76aa16a4b17e3c31a104f137ebe6 Signed-off-by: Damjan Marion --- src/vlib/CMakeLists.txt | 1 - src/vlib/file.c | 293 ++++++++++++++++++++++++++++++++ src/vlib/file.h | 2 + src/vlib/freebsd/pci.c | 4 +- src/vlib/linux/pci.c | 13 +- src/vlib/linux/vmbus.c | 13 +- src/vlib/main.c | 13 +- src/vlib/main.h | 10 ++ src/vlib/node_funcs.h | 5 +- src/vlib/stats/init.c | 4 +- src/vlib/threads.c | 9 +- src/vlib/threads.h | 19 ++- src/vlib/unix/cli.c | 38 ----- src/vlib/unix/input.c | 433 ------------------------------------------------ src/vlib/unix/main.c | 5 +- 15 files changed, 340 insertions(+), 522 deletions(-) delete mode 100644 src/vlib/unix/input.c diff --git a/src/vlib/CMakeLists.txt b/src/vlib/CMakeLists.txt index 3448f08cd58..b4fc1775194 100644 --- a/src/vlib/CMakeLists.txt +++ b/src/vlib/CMakeLists.txt @@ -118,7 +118,6 @@ add_vpp_library(vlib time.c trace.c unix/cli.c - unix/input.c unix/main.c unix/plugin.c unix/util.c diff --git a/src/vlib/file.c b/src/vlib/file.c index b9688e9439f..286b0d1f2ad 100644 --- a/src/vlib/file.c +++ b/src/vlib/file.c @@ -9,4 +9,297 @@ #include #include +VLIB_REGISTER_LOG_CLASS (vlib_file_log, static) = { + .class_name = "vlib", + .subclass_name = "file", +}; + +#define log_debug(fmt, ...) \ + vlib_log_debug (vlib_file_log.class, fmt, __VA_ARGS__) +#define log_warn(fmt, ...) \ + vlib_log_warn (vlib_file_log.class, fmt, __VA_ARGS__) +#define log_err(fmt, ...) 
vlib_log_err (vlib_file_log.class, fmt, __VA_ARGS__) + clib_file_main_t file_main; + +static void +vlib_file_update (clib_file_t *f, clib_file_update_type_t update_type) +{ + vlib_main_t *vm = vlib_get_main_by_index (f->polling_thread_index); + int op = -1, add_del = 0; + + struct epoll_event e = { + .events = EPOLLIN, + .data.ptr = f, + }; + + if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE) + e.events |= EPOLLOUT; + if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED) + e.events |= EPOLLET; + + switch (update_type) + { + case UNIX_FILE_UPDATE_ADD: + op = EPOLL_CTL_ADD; + add_del = 1; + break; + + case UNIX_FILE_UPDATE_MODIFY: + op = EPOLL_CTL_MOD; + break; + + case UNIX_FILE_UPDATE_DELETE: + op = EPOLL_CTL_DEL; + add_del = -1; + break; + + default: + log_err ("%s: unknown update_type %d", __func__, update_type); + return; + } + + if (epoll_ctl (vm->epoll_fd, op, (int) f->file_descriptor, &e) < 0) + { + log_err ("%s: epoll_ctl() failed, errno %d", __func__, errno); + return; + } + + vm->n_epoll_fds += add_del; +} + +static clib_error_t * +wake_read_fn (struct clib_file *f) +{ + u64 val, __clib_unused rv; + rv = read ((int) f->file_descriptor, &val, sizeof (u64)); + return 0; +} + +void +vlib_file_poll_init (vlib_main_t *vm) +{ + vm->epoll_fd = epoll_create (1); + + if (vm->epoll_fd < 0) + clib_panic ("failed to initialize epoll for thread %u", vm->thread_index); + + vm->wakeup_fd = eventfd (0, EFD_NONBLOCK); + + if (vm->wakeup_fd < 0) + clib_panic ("failed to initialize wakeup event for thread %u", + vm->thread_index); + + if (!file_main.file_update) + file_main.file_update = vlib_file_update; + + clib_file_add (&file_main, &(clib_file_t){ + .polling_thread_index = vm->thread_index, + .file_descriptor = vm->wakeup_fd, + .description = format (0, "wakeup thread %u", + vm->thread_index), + .read_function = wake_read_fn, + }); +} + +void +vlib_file_poll (vlib_main_t *vm) +{ + vlib_node_main_t *nm = &vm->node_main; + unix_main_t *um = &unix_main; + struct epoll_event *e, 
epoll_events[16]; + int n_fds_ready; + int is_main = (vm->thread_index == 0); + int timeout_ms = 0, max_timeout_ms = 10; + u32 ticks; + + /* + * If we've been asked for a fixed-sleep between main loop polls, + * do so right away. + */ + if (PREDICT_FALSE (is_main && um->poll_sleep_usec)) + { + struct timespec ts, tsrem; + ts.tv_sec = 0; + ts.tv_nsec = 1000L * um->poll_sleep_usec; + + while (nanosleep (&ts, &tsrem) < 0) + ts = tsrem; + + goto epoll; + } + + /* we are busy, skip some loops before polling again */ + if (vlib_last_vectors_per_main_loop (vm) >= 2) + goto skip_loops; + + /* at least one node is polling */ + if (nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING]) + goto skip_loops; + + /* pending APIs in the queue */ + if (is_main && vm->api_queue_nonempty) + goto skip_loops; + + if (is_main == 0) + { + if (*vlib_worker_threads->wait_at_barrier) + goto epoll; + + if (vlib_get_first_main ()->time_last_barrier_release + 0.5 >= + vlib_time_now (vm)) + goto skip_loops; + } + + /* check for pending interrupts */ + for (int nt = 0; nt < VLIB_N_NODE_TYPE; nt++) + if (nm->node_interrupts[nt] && + clib_interrupt_is_any_pending (nm->node_interrupts[nt])) + goto epoll; + + /* at this point we know that thread is going to sleep, so let's announce + * to other threads that they need to wake us if they need our attention */ + __atomic_store_n (&vm->thread_sleeps, 1, __ATOMIC_RELAXED); + + ticks = vlib_tw_timer_first_expires_in_ticks (vm); + + if (ticks != TW_SLOTS_PER_RING) + { + timeout_ms = (int) (ticks / ((u32) VLIB_TW_TICKS_PER_SECOND / 1000)); + timeout_ms = clib_min (timeout_ms, max_timeout_ms); + } + else + timeout_ms = max_timeout_ms; + + goto epoll; + +skip_loops: + /* Don't come back for a respectable number of dispatch cycles */ + vm->file_poll_skip_loops = 1024; + +epoll: + n_fds_ready = epoll_wait (vm->epoll_fd, epoll_events, + ARRAY_LEN (epoll_events), timeout_ms); + + __atomic_store_n (&vm->thread_sleeps, 0, __ATOMIC_RELAXED); + __atomic_store_n 
(&vm->wakeup_pending, 0, __ATOMIC_RELAXED); + + if (n_fds_ready < 0) + { + if (unix_error_is_fatal (errno)) + vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait")); + + /* non fatal error (e.g. EINTR). */ + return; + } + + vm->epoll_waits += 1; + vm->epoll_files_ready += n_fds_ready; + + for (e = epoll_events; e < epoll_events + n_fds_ready; e++) + { + clib_file_t *f = e->data.ptr; + clib_error_t *err; + + if (PREDICT_FALSE (!f->active)) + { + foreach_int (flag, EPOLLIN, EPOLLOUT, EPOLLERR) + if (e->events & flag) + { + const char *str[] = { + [EPOLLIN] = "EPOLLIN", + [EPOLLOUT] = "EPOLLOUT", + [EPOLLERR] = "EPOLLERR", + }; + log_debug ("epoll event %s dropped due to inactive file", + str[flag]); + } + continue; + } + else if (PREDICT_TRUE (!(e->events & EPOLLERR))) + { + if (e->events & EPOLLIN) + { + f->read_events++; + err = f->read_function (f); + if (err) + { + log_err ("file read error: %U", format_clib_error, err); + clib_error_free (err); + } + } + if (e->events & EPOLLOUT) + { + f->write_events++; + err = f->write_function (f); + if (err) + { + log_err ("file write error: %U", format_clib_error, err); + clib_error_free (err); + } + } + } + else + { + if (f->error_function) + { + f->error_events++; + err = f->error_function (f); + if (err) + { + log_err ("file error: %U", format_clib_error, err); + clib_error_free (err); + } + } + else if (f->dont_close == 0) + close ((int) f->file_descriptor); + } + } + + /* maximum epoll events received, there may be more ... 
*/ + if (n_fds_ready == ARRAY_LEN (epoll_events)) + { + timeout_ms = 0; + goto epoll; + } + + /* removing fd from epoll instance doesn't remove event from epoll queue + * so we need to be sure epoll queue is empty before freeing */ + clib_file_free_deleted (&file_main, vm->thread_index); +} + +static clib_error_t * +show_files (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd) +{ + clib_error_t *error = 0; + clib_file_main_t *fm = &file_main; + char path[PATH_MAX]; + u8 *s = 0; + + vlib_cli_output (vm, "%3s %6s %12s %12s %12s %-32s %s", "FD", "Thread", + "Read", "Write", "Error", "File Name", "Description"); + + pool_foreach_pointer (f, fm->file_pool) + { + ssize_t rv; + s = format (s, "/proc/self/fd/%d%c", f->file_descriptor, 0); + rv = readlink ((char *) s, path, PATH_MAX - 1); + + path[rv < 0 ? 0 : rv] = 0; + + vlib_cli_output (vm, "%3d %6d %12d %12d %12d %-32s %v", + f->file_descriptor, f->polling_thread_index, + f->read_events, f->write_events, f->error_events, path, + f->description); + vec_reset_length (s); + } + vec_free (s); + + return error; +} + +VLIB_CLI_COMMAND (cli_show_files, static) = { + .path = "show files", + .short_help = "Show files in use", + .function = show_files, +}; diff --git a/src/vlib/file.h b/src/vlib/file.h index c10c1edc990..82bbb22f650 100644 --- a/src/vlib/file.h +++ b/src/vlib/file.h @@ -9,4 +9,6 @@ extern clib_file_main_t file_main; +void vlib_file_poll_init (vlib_main_t *vm); +void vlib_file_poll (vlib_main_t *vm); #endif /* __vlib_file_h__ */ diff --git a/src/vlib/freebsd/pci.c b/src/vlib/freebsd/pci.c index a4e9eb2dda6..92c27c24373 100644 --- a/src/vlib/freebsd/pci.c +++ b/src/vlib/freebsd/pci.c @@ -375,6 +375,4 @@ freebsd_pci_init (vlib_main_t *vm) return 0; } -VLIB_INIT_FUNCTION (freebsd_pci_init) = { - .runs_after = VLIB_INITS ("unix_input_init"), -}; +VLIB_INIT_FUNCTION (freebsd_pci_init); diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c index 5d33e533197..bf9c6f27cd5 100644 --- 
a/src/vlib/linux/pci.c +++ b/src/vlib/linux/pci.c @@ -1579,15 +1579,4 @@ linux_pci_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (linux_pci_init) = -{ - .runs_after = VLIB_INITS("unix_input_init"), -}; - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ +VLIB_INIT_FUNCTION (linux_pci_init); diff --git a/src/vlib/linux/vmbus.c b/src/vlib/linux/vmbus.c index 9dc9d554ebd..27a5e271fd0 100644 --- a/src/vlib/linux/vmbus.c +++ b/src/vlib/linux/vmbus.c @@ -455,15 +455,4 @@ linux_vmbus_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (linux_vmbus_init) = -{ - .runs_before = VLIB_INITS("unix_input_init"), -}; - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ +VLIB_INIT_FUNCTION (linux_vmbus_init); diff --git a/src/vlib/main.c b/src/vlib/main.c index f083bd2693e..ab8c54f4cec 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -45,8 +45,6 @@ #include #include -#include - #define VLIB_FRAME_MAGIC (0xabadc0ed) always_inline u32 * @@ -1579,6 +1577,11 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) cpu_time_now = clib_cpu_time_now (); + if (vm->file_poll_skip_loops) + vm->file_poll_skip_loops--; + else + vlib_file_poll (vm); + for (vlib_node_type_t nt = 0; nt < VLIB_N_NODE_TYPE; nt++) { if (node_type_attrs[nt].can_be_polled) @@ -1975,14 +1978,15 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) goto done; } + vlib_tw_init (vm); + vlib_file_poll_init (vm); + /* See unix/main.c; most likely already set up */ if (vgm->init_functions_called == 0) vgm->init_functions_called = hash_create (0, /* value bytes */ 0); if ((error = vlib_call_all_init_functions (vm))) goto done; - vlib_tw_init (vm); - vec_validate (nm->process_restore_current, 10); vec_validate (nm->process_restore_next, 10); vec_set_len (nm->process_restore_current, 0); @@ -2073,6 +2077,7 @@ vlib_worker_thread_fn (void *arg) clib_time_init 
(&vm->clib_time); clib_mem_set_heap (w->thread_mheap); vlib_tw_init (vm); + vlib_file_poll_init (vm); vm->worker_init_functions_called = hash_create (0, 0); diff --git a/src/vlib/main.h b/src/vlib/main.h index 270b203bd68..1700369738d 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -207,6 +207,16 @@ typedef struct vlib_main_t clib_thread_index_t thread_index; u32 numa_node; + /* epoll and eventfd */ + int epoll_fd; + int wakeup_fd; + int n_epoll_fds; + u32 file_poll_skip_loops; + u64 epoll_files_ready; + u64 epoll_waits; + u8 wakeup_pending; + u8 thread_sleeps; + /* control-plane API queue signal pending, length indication */ volatile u32 queue_signal_pending; volatile u32 api_queue_nonempty; diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index b5088282168..17677ee7aec 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -253,7 +253,10 @@ vlib_node_set_interrupt_pending (vlib_main_t *vm, u32 node_index) ASSERT (interrupts); if (vm != vlib_get_main ()) - clib_interrupt_set_atomic (interrupts, n->runtime_index); + { + clib_interrupt_set_atomic (interrupts, n->runtime_index); + vlib_thread_wakeup (vm->thread_index); + } else clib_interrupt_set (interrupts, n->runtime_index); } diff --git a/src/vlib/stats/init.c b/src/vlib/stats/init.c index 212ce5791c2..50f71b3eb11 100644 --- a/src/vlib/stats/init.c +++ b/src/vlib/stats/init.c @@ -254,6 +254,4 @@ statseg_init (vlib_main_t *vm) return stats_segment_socket_init (); } -VLIB_INIT_FUNCTION (statseg_init) = { - .runs_after = VLIB_INITS ("unix_input_init", "linux_epoll_input_init"), -}; +VLIB_INIT_FUNCTION (statseg_init); diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 949ec9c3c0e..a1839e787c3 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -1407,7 +1407,11 @@ vlib_worker_thread_barrier_sync_int (vlib_main_t * vm, const char *func_name) deadline = now + BARRIER_SYNC_TIMEOUT; - *vlib_worker_threads->wait_at_barrier = 1; + __atomic_store_n 
(vlib_worker_threads->wait_at_barrier, 1, __ATOMIC_RELEASE); + + for (clib_thread_index_t ti = 1; ti < vlib_get_n_threads (); ti++) + vlib_thread_wakeup (ti); + while (*vlib_worker_threads->workers_at_barrier != count) { if ((now = vlib_time_now (vm)) > deadline) @@ -1480,9 +1484,8 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) * time offset. See vlib_time_now(...) */ vm->time_last_barrier_release = vlib_time_now (vm); - CLIB_MEMORY_STORE_BARRIER (); - *vlib_worker_threads->wait_at_barrier = 0; + __atomic_store_n (vlib_worker_threads->wait_at_barrier, 0, __ATOMIC_RELEASE); while (*vlib_worker_threads->workers_at_barrier > 0) { diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 94bb335fb3b..46a1476ee4b 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -504,13 +504,16 @@ void vlib_workers_sync (void); * Release barrier after workers sync */ void vlib_workers_continue (void); +static_always_inline void +vlib_thread_wakeup (clib_thread_index_t thread_index) +{ + vlib_main_t *vm = vlib_get_main_by_index (thread_index); + ssize_t __clib_unused rv; + u64 val = 1; -#endif /* included_vlib_threads_h */ + if (__atomic_load_n (&vm->thread_sleeps, __ATOMIC_RELAXED)) + if (__atomic_exchange_n (&vm->wakeup_pending, 1, __ATOMIC_RELAXED) == 0) + rv = write (vm->wakeup_fd, &val, sizeof (u64)); +} -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ +#endif /* included_vlib_threads_h */ diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 59ad8381bed..8f0f00c8b9f 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -3549,44 +3549,6 @@ VLIB_CLI_COMMAND (cli_unix_show_errors, static) = { .function = unix_show_errors, }; -/** CLI command to show various unix error statistics. 
*/ -static clib_error_t * -unix_show_files (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - clib_error_t *error = 0; - clib_file_main_t *fm = &file_main; - char path[PATH_MAX]; - u8 *s = 0; - - vlib_cli_output (vm, "%3s %6s %12s %12s %12s %-32s %s", "FD", "Thread", - "Read", "Write", "Error", "File Name", "Description"); - - pool_foreach_pointer (f, fm->file_pool) - { - int rv; - s = format (s, "/proc/self/fd/%d%c", f->file_descriptor, 0); - rv = readlink((char *) s, path, PATH_MAX - 1); - - path[rv < 0 ? 0 : rv] = 0; - - vlib_cli_output (vm, "%3d %6d %12d %12d %12d %-32s %v", - f->file_descriptor, f->polling_thread_index, - f->read_events, f->write_events, f->error_events, - path, f->description); - vec_reset_length (s); - } - vec_free (s); - - return error; -} - -VLIB_CLI_COMMAND (cli_unix_show_files, static) = { - .path = "show unix files", - .short_help = "Show Unix files in use", - .function = unix_show_files, -}; - /** CLI command to show session command history. */ static clib_error_t * unix_cli_show_history (vlib_main_t * vm, diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c deleted file mode 100644 index dcc76fc4234..00000000000 --- a/src/vlib/unix/input.c +++ /dev/null @@ -1,433 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/* - * input.c: Unix file input - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include - -/* FIXME autoconf */ -#define HAVE_LINUX_EPOLL - -#ifdef HAVE_LINUX_EPOLL - -#include - -typedef struct -{ - CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); - int epoll_fd; - struct epoll_event *epoll_events; - int n_epoll_fds; - - /* Statistics. 
*/ - u64 epoll_files_ready; - u64 epoll_waits; -} linux_epoll_main_t; - -static linux_epoll_main_t *linux_epoll_mains = 0; - -static void -linux_epoll_file_update (clib_file_t * f, clib_file_update_type_t update_type) -{ - linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains, - f->polling_thread_index); - struct epoll_event e = { 0 }; - int op, add_del = 0; - - e.events = EPOLLIN; - if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE) - e.events |= EPOLLOUT; - if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED) - e.events |= EPOLLET; - e.data.u32 = f->index; - - op = -1; - - switch (update_type) - { - case UNIX_FILE_UPDATE_ADD: - op = EPOLL_CTL_ADD; - add_del = 1; - break; - - case UNIX_FILE_UPDATE_MODIFY: - op = EPOLL_CTL_MOD; - break; - - case UNIX_FILE_UPDATE_DELETE: - op = EPOLL_CTL_DEL; - add_del = -1; - break; - - default: - clib_warning ("unknown update_type %d", update_type); - return; - } - - /* worker threads open epoll fd only if needed */ - if (update_type == UNIX_FILE_UPDATE_ADD && em->epoll_fd == -1) - { - em->epoll_fd = epoll_create (1); - if (em->epoll_fd < 0) - { - clib_unix_warning ("epoll_create"); - return; - } - em->n_epoll_fds = 0; - } - - if (epoll_ctl (em->epoll_fd, op, f->file_descriptor, &e) < 0) - { - clib_unix_warning ("epoll_ctl"); - return; - } - - em->n_epoll_fds += add_del; - - if (em->n_epoll_fds == 0) - { - close (em->epoll_fd); - em->epoll_fd = -1; - } -} - -static int -is_int_pending (vlib_node_main_t *nm) -{ - - for (int nt = 0; nt < VLIB_N_NODE_TYPE; nt++) - if (nm->node_interrupts[nt] && - clib_interrupt_is_any_pending (nm->node_interrupts[nt])) - return 1; - return 0; -} - -static_always_inline uword -linux_epoll_input_inline (vlib_main_t *vm, vlib_node_runtime_t *node, - vlib_frame_t *frame, - clib_thread_index_t thread_index) -{ - unix_main_t *um = &unix_main; - clib_file_main_t *fm = &file_main; - linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains, thread_index); - struct epoll_event *e; - int n_fds_ready; - int 
is_main = (thread_index == 0); - - { - vlib_node_main_t *nm = &vm->node_main; - u32 ticks_until_expiration; - f64 timeout; - f64 now; - int timeout_ms = 0, max_timeout_ms = 10; - f64 vector_rate = vlib_last_vectors_per_main_loop (vm); - - if (is_main == 0) - now = vlib_time_now (vm); - - /* - * If we've been asked for a fixed-sleep between main loop polls, - * do so right away. - */ - if (PREDICT_FALSE (is_main && um->poll_sleep_usec)) - { - struct timespec ts, tsrem; - timeout = 0; - timeout_ms = 0; - node->input_main_loops_per_call = 0; - ts.tv_sec = 0; - ts.tv_nsec = 1000 * um->poll_sleep_usec; - - while (nanosleep (&ts, &tsrem) < 0) - { - ts = tsrem; - } - } - /* If we're not working very hard, decide how long to sleep */ - else if (is_main && vector_rate < 2 && vm->api_queue_nonempty == 0 - && nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0) - { - ticks_until_expiration = vlib_tw_timer_first_expires_in_ticks (vm); - - /* Nothing on the fast wheel, sleep 10ms */ - if (ticks_until_expiration == TW_SLOTS_PER_RING) - { - timeout = 10e-3; - timeout_ms = max_timeout_ms; - } - else - { - timeout = (f64) ticks_until_expiration *1e-5; - if (timeout < 1e-3) - timeout_ms = 0; - else - { - timeout_ms = timeout * 1e3; - /* Must be between 1 and 10 ms. */ - timeout_ms = clib_max (1, timeout_ms); - timeout_ms = clib_min (max_timeout_ms, timeout_ms); - } - } - node->input_main_loops_per_call = 0; - } - else if (is_main == 0 && vector_rate < 2 && - (vlib_get_first_main ()->time_last_barrier_release + 0.5 < now) && - nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0) - { - timeout = 10e-3; - timeout_ms = max_timeout_ms; - node->input_main_loops_per_call = 0; - } - else /* busy */ - { - /* Don't come back for a respectable number of dispatch cycles */ - node->input_main_loops_per_call = 1024; - } - - /* Allow any signal to wakeup our sleep. 
*/ - if (is_main || em->epoll_fd != -1) - { - static sigset_t unblock_all_signals; - n_fds_ready = epoll_pwait (em->epoll_fd, - em->epoll_events, - vec_len (em->epoll_events), - timeout_ms, &unblock_all_signals); - - /* This kludge is necessary to run over absurdly old kernels */ - if (n_fds_ready < 0 && errno == ENOSYS) - { - n_fds_ready = epoll_wait (em->epoll_fd, - em->epoll_events, - vec_len (em->epoll_events), timeout_ms); - } - - } - else - { - /* - * Worker thread, no epoll fd's, sleep for 100us at a time - * and check for a barrier sync request - */ - if (timeout_ms) - { - struct timespec ts, tsrem; - f64 limit = now + (f64) timeout_ms * 1e-3; - - while (vlib_time_now (vm) < limit) - { - /* Sleep for 100us at a time */ - ts.tv_sec = 0; - ts.tv_nsec = 1000 * 100; - - while (nanosleep (&ts, &tsrem) < 0) - ts = tsrem; - if (*vlib_worker_threads->wait_at_barrier || - is_int_pending (nm)) - goto done; - } - } - goto done; - } - } - - if (n_fds_ready < 0) - { - if (unix_error_is_fatal (errno)) - vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait")); - - /* non fatal error (e.g. EINTR). */ - goto done; - } - - em->epoll_waits += 1; - em->epoll_files_ready += n_fds_ready; - - for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++) - { - u32 i = e->data.u32; - clib_file_t *f; - clib_error_t *errors[4]; - int n_errors = 0; - - /* - * Under rare scenarios, epoll may still post us events for the - * deleted file descriptor. We just deal with it and throw away the - * events for the corresponding file descriptor. 
- */ - f = clib_file_get (fm, i); - if (PREDICT_FALSE (!f)) - { - if (e->events & EPOLLIN) - { - errors[n_errors] = - clib_error_return (0, "epoll event EPOLLIN dropped due " - "to free index %u", i); - n_errors++; - } - if (e->events & EPOLLOUT) - { - errors[n_errors] = - clib_error_return (0, "epoll event EPOLLOUT dropped due " - "to free index %u", i); - n_errors++; - } - if (e->events & EPOLLERR) - { - errors[n_errors] = - clib_error_return (0, "epoll event EPOLLERR dropped due " - "to free index %u", i); - n_errors++; - } - } - else if (PREDICT_TRUE (!(e->events & EPOLLERR))) - { - if (e->events & EPOLLIN) - { - f->read_events++; - errors[n_errors] = f->read_function (f); - /* Make sure f is valid if the file pool moves */ - if (pool_is_free_index (fm->file_pool, i)) - continue; - f = clib_file_get (fm, i); - n_errors += errors[n_errors] != 0; - } - if (e->events & EPOLLOUT) - { - f->write_events++; - errors[n_errors] = f->write_function (f); - n_errors += errors[n_errors] != 0; - } - } - else - { - if (f->error_function) - { - f->error_events++; - errors[n_errors] = f->error_function (f); - n_errors += errors[n_errors] != 0; - } - else - close (f->file_descriptor); - } - - ASSERT (n_errors < ARRAY_LEN (errors)); - for (i = 0; i < n_errors; i++) - { - unix_save_error (um, errors[i]); - } - } - -done: - return 0; -} - -static uword -linux_epoll_input (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame) -{ - clib_thread_index_t thread_index = vlib_get_thread_index (); - - if (thread_index == 0) - return linux_epoll_input_inline (vm, node, frame, 0); - else - return linux_epoll_input_inline (vm, node, frame, thread_index); -} - -VLIB_REGISTER_NODE (linux_epoll_input_node,static) = { - .function = linux_epoll_input, - .type = VLIB_NODE_TYPE_PRE_INPUT, - .name = "unix-epoll-input", -}; - -clib_error_t * -linux_epoll_input_init (vlib_main_t * vm) -{ - linux_epoll_main_t *em; - clib_file_main_t *fm = &file_main; - vlib_thread_main_t *tm = 
vlib_get_thread_main (); - - - vec_validate_aligned (linux_epoll_mains, tm->n_vlib_mains, - CLIB_CACHE_LINE_BYTES); - - vec_foreach (em, linux_epoll_mains) - { - /* Allocate some events. */ - vec_resize (em->epoll_events, VLIB_FRAME_SIZE); - - if (linux_epoll_mains == em) - { - em->epoll_fd = epoll_create (1); - if (em->epoll_fd < 0) - return clib_error_return_unix (0, "epoll_create"); - } - else - em->epoll_fd = -1; - } - - fm->file_update = linux_epoll_file_update; - - return 0; -} - -VLIB_INIT_FUNCTION (linux_epoll_input_init); - -#endif /* HAVE_LINUX_EPOLL */ - -static clib_error_t * -unix_input_init (vlib_main_t * vm) -{ - return 0; -} - -VLIB_INIT_FUNCTION (unix_input_init) = -{ - .runs_before = VLIB_INITS ("linux_epoll_input_init"), -}; - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 37ca5b1fdb5..5841451735e 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -77,10 +77,7 @@ unix_main_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (unix_main_init) = -{ - .runs_before = VLIB_INITS ("unix_input_init"), -}; +VLIB_INIT_FUNCTION (unix_main_init); static int unsetup_signal_handlers (int sig) -- 2.16.6