aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core/lib/iomgr
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/lib/iomgr')
-rw-r--r--src/core/lib/iomgr/combiner.c12
-rw-r--r--src/core/lib/iomgr/combiner.h3
-rw-r--r--src/core/lib/iomgr/error.c2
-rw-r--r--src/core/lib/iomgr/ev_epoll1_linux.c984
-rw-r--r--src/core/lib/iomgr/ev_epoll1_linux.h44
-rw-r--r--src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c2146
-rw-r--r--src/core/lib/iomgr/ev_epoll_limited_pollers_linux.h43
-rw-r--r--src/core/lib/iomgr/ev_epoll_thread_pool_linux.c1337
-rw-r--r--src/core/lib/iomgr/ev_epoll_thread_pool_linux.h43
-rw-r--r--src/core/lib/iomgr/ev_epollex_linux.c1511
-rw-r--r--src/core/lib/iomgr/ev_epollex_linux.h43
-rw-r--r--src/core/lib/iomgr/ev_epollsig_linux.c (renamed from src/core/lib/iomgr/ev_epoll_linux.c)74
-rw-r--r--src/core/lib/iomgr/ev_epollsig_linux.h (renamed from src/core/lib/iomgr/ev_epoll_linux.h)8
-rw-r--r--src/core/lib/iomgr/ev_poll_posix.c39
-rw-r--r--src/core/lib/iomgr/ev_poll_posix.h4
-rw-r--r--src/core/lib/iomgr/ev_posix.c30
-rw-r--r--src/core/lib/iomgr/ev_posix.h7
-rw-r--r--src/core/lib/iomgr/ev_windows.c43
-rw-r--r--src/core/lib/iomgr/exec_ctx.c5
-rw-r--r--src/core/lib/iomgr/exec_ctx.h2
-rw-r--r--src/core/lib/iomgr/iomgr.c4
-rw-r--r--src/core/lib/iomgr/iomgr.h3
-rw-r--r--src/core/lib/iomgr/is_epollexclusive_available.c116
-rw-r--r--src/core/lib/iomgr/is_epollexclusive_available.h41
-rw-r--r--src/core/lib/iomgr/lockfree_event.c16
-rw-r--r--src/core/lib/iomgr/pollset.h7
-rw-r--r--src/core/lib/iomgr/pollset_uv.c2
-rw-r--r--src/core/lib/iomgr/pollset_windows.c6
-rw-r--r--src/core/lib/iomgr/port.h1
-rw-r--r--src/core/lib/iomgr/resource_quota.c17
-rw-r--r--src/core/lib/iomgr/resource_quota.h3
-rw-r--r--src/core/lib/iomgr/sys_epoll_wrapper.h43
-rw-r--r--src/core/lib/iomgr/tcp_client_posix.c12
-rw-r--r--src/core/lib/iomgr/tcp_client_uv.c6
-rw-r--r--src/core/lib/iomgr/tcp_posix.c14
-rw-r--r--src/core/lib/iomgr/tcp_posix.h3
-rw-r--r--src/core/lib/iomgr/tcp_server_posix.c2
-rw-r--r--src/core/lib/iomgr/tcp_server_uv.c32
-rw-r--r--src/core/lib/iomgr/tcp_uv.c26
-rw-r--r--src/core/lib/iomgr/tcp_uv.h3
-rw-r--r--src/core/lib/iomgr/timer_generic.c49
-rw-r--r--src/core/lib/iomgr/timer_manager.c276
-rw-r--r--src/core/lib/iomgr/timer_manager.h52
-rw-r--r--src/core/lib/iomgr/timer_uv.c6
44 files changed, 6956 insertions, 164 deletions
diff --git a/src/core/lib/iomgr/combiner.c b/src/core/lib/iomgr/combiner.c
index 05cdbdad2b..863f22c614 100644
--- a/src/core/lib/iomgr/combiner.c
+++ b/src/core/lib/iomgr/combiner.c
@@ -42,13 +42,13 @@
#include "src/core/lib/iomgr/workqueue.h"
#include "src/core/lib/profiling/timers.h"
-int grpc_combiner_trace = 0;
+grpc_tracer_flag grpc_combiner_trace = GRPC_TRACER_INITIALIZER(false);
-#define GRPC_COMBINER_TRACE(fn) \
- do { \
- if (grpc_combiner_trace) { \
- fn; \
- } \
+#define GRPC_COMBINER_TRACE(fn) \
+ do { \
+ if (GRPC_TRACER_ON(grpc_combiner_trace)) { \
+ fn; \
+ } \
} while (0)
#define STATE_UNORPHANED 1
diff --git a/src/core/lib/iomgr/combiner.h b/src/core/lib/iomgr/combiner.h
index 75dcb0b70a..6ab7a2b26b 100644
--- a/src/core/lib/iomgr/combiner.h
+++ b/src/core/lib/iomgr/combiner.h
@@ -37,6 +37,7 @@
#include <stddef.h>
#include <grpc/support/atm.h>
+#include "src/core/lib/debug/trace.h"
#include "src/core/lib/iomgr/exec_ctx.h"
#include "src/core/lib/support/mpscq.h"
@@ -78,6 +79,6 @@ grpc_closure_scheduler *grpc_combiner_finally_scheduler(grpc_combiner *lock,
bool grpc_combiner_continue_exec_ctx(grpc_exec_ctx *exec_ctx);
-extern int grpc_combiner_trace;
+extern grpc_tracer_flag grpc_combiner_trace;
#endif /* GRPC_CORE_LIB_IOMGR_COMBINER_H */
diff --git a/src/core/lib/iomgr/error.c b/src/core/lib/iomgr/error.c
index 5f2c989aad..685581b5cb 100644
--- a/src/core/lib/iomgr/error.c
+++ b/src/core/lib/iomgr/error.c
@@ -769,7 +769,7 @@ grpc_error *grpc_os_error(const char *file, int line, int err,
GRPC_ERROR_INT_ERRNO, err),
GRPC_ERROR_STR_OS_ERROR,
grpc_slice_from_static_string(strerror(err))),
- GRPC_ERROR_STR_SYSCALL, grpc_slice_from_static_string(call_name));
+ GRPC_ERROR_STR_SYSCALL, grpc_slice_from_copied_string(call_name));
}
#ifdef GPR_WINDOWS
diff --git a/src/core/lib/iomgr/ev_epoll1_linux.c b/src/core/lib/iomgr/ev_epoll1_linux.c
new file mode 100644
index 0000000000..ad69f808cd
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epoll1_linux.c
@@ -0,0 +1,984 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/lib/iomgr/port.h"
+
+/* This polling engine is only relevant on linux kernels supporting epoll() */
+#ifdef GRPC_LINUX_EPOLL
+
+#include "src/core/lib/iomgr/ev_epoll1_linux.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <grpc/support/alloc.h>
+#include <grpc/support/cpu.h>
+#include <grpc/support/log.h>
+#include <grpc/support/string_util.h>
+#include <grpc/support/tls.h>
+#include <grpc/support/useful.h>
+
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/iomgr_internal.h"
+#include "src/core/lib/iomgr/lockfree_event.h"
+#include "src/core/lib/iomgr/wakeup_fd_posix.h"
+#include "src/core/lib/iomgr/workqueue.h"
+#include "src/core/lib/profiling/timers.h"
+#include "src/core/lib/support/block_annotate.h"
+
+static grpc_wakeup_fd global_wakeup_fd;
+static int g_epfd;
+
+/*******************************************************************************
+ * Fd Declarations
+ */
+
+struct grpc_fd {
+ int fd;
+
+ gpr_atm read_closure;
+ gpr_atm write_closure;
+
+ struct grpc_fd *freelist_next;
+
+ /* The pollset that last noticed that the fd is readable. The actual type
+ * stored in this is (grpc_pollset *) */
+ gpr_atm read_notifier_pollset;
+
+ grpc_iomgr_object iomgr_object;
+};
+
+static void fd_global_init(void);
+static void fd_global_shutdown(void);
+
+/*******************************************************************************
+ * Pollset Declarations
+ */
+
+typedef enum { UNKICKED, KICKED, DESIGNATED_POLLER } kick_state;
+
+struct grpc_pollset_worker {
+ kick_state kick_state;
+ bool initialized_cv;
+ grpc_pollset_worker *next;
+ grpc_pollset_worker *prev;
+ gpr_cv cv;
+ grpc_closure_list schedule_on_end_work;
+};
+
+#define MAX_NEIGHBOURHOODS 1024
+
+typedef struct pollset_neighbourhood {
+ gpr_mu mu;
+ grpc_pollset *active_root;
+ char pad[GPR_CACHELINE_SIZE];
+} pollset_neighbourhood;
+
+struct grpc_pollset {
+ gpr_mu mu;
+ pollset_neighbourhood *neighbourhood;
+ bool reassigning_neighbourhood;
+ grpc_pollset_worker *root_worker;
+ bool kicked_without_poller;
+ bool seen_inactive;
+ bool shutting_down; /* Is the pollset shutting down ? */
+ bool finish_shutdown_called; /* Is the 'finish_shutdown_locked()' called ? */
+ grpc_closure *shutdown_closure; /* Called after after shutdown is complete */
+ int begin_refs;
+
+ grpc_pollset *next;
+ grpc_pollset *prev;
+};
+
+/*******************************************************************************
+ * Pollset-set Declarations
+ */
+
+struct grpc_pollset_set {};
+
+/*******************************************************************************
+ * Common helpers
+ */
+
+static bool append_error(grpc_error **composite, grpc_error *error,
+ const char *desc) {
+ if (error == GRPC_ERROR_NONE) return true;
+ if (*composite == GRPC_ERROR_NONE) {
+ *composite = GRPC_ERROR_CREATE_FROM_COPIED_STRING(desc);
+ }
+ *composite = grpc_error_add_child(*composite, error);
+ return false;
+}
+
+/*******************************************************************************
+ * Fd Definitions
+ */
+
+/* We need to keep a freelist not because of any concerns of malloc performance
+ * but instead so that implementations with multiple threads in (for example)
+ * epoll_wait deal with the race between pollset removal and incoming poll
+ * notifications.
+ *
+ * The problem is that the poller ultimately holds a reference to this
+ * object, so it is very difficult to know when is safe to free it, at least
+ * without some expensive synchronization.
+ *
+ * If we keep the object freelisted, in the worst case losing this race just
+ * becomes a spurious read notification on a reused fd.
+ */
+
+/* The alarm system needs to be able to wakeup 'some poller' sometimes
+ * (specifically when a new alarm needs to be triggered earlier than the next
+ * alarm 'epoch'). This wakeup_fd gives us something to alert on when such a
+ * case occurs. */
+
+static grpc_fd *fd_freelist = NULL;
+static gpr_mu fd_freelist_mu;
+
+static void fd_global_init(void) { gpr_mu_init(&fd_freelist_mu); }
+
+static void fd_global_shutdown(void) {
+ gpr_mu_lock(&fd_freelist_mu);
+ gpr_mu_unlock(&fd_freelist_mu);
+ while (fd_freelist != NULL) {
+ grpc_fd *fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ gpr_free(fd);
+ }
+ gpr_mu_destroy(&fd_freelist_mu);
+}
+
+static grpc_fd *fd_create(int fd, const char *name) {
+ grpc_fd *new_fd = NULL;
+
+ gpr_mu_lock(&fd_freelist_mu);
+ if (fd_freelist != NULL) {
+ new_fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ }
+ gpr_mu_unlock(&fd_freelist_mu);
+
+ if (new_fd == NULL) {
+ new_fd = gpr_malloc(sizeof(grpc_fd));
+ }
+
+ new_fd->fd = fd;
+ grpc_lfev_init(&new_fd->read_closure);
+ grpc_lfev_init(&new_fd->write_closure);
+ gpr_atm_no_barrier_store(&new_fd->read_notifier_pollset, (gpr_atm)NULL);
+
+ new_fd->freelist_next = NULL;
+
+ char *fd_name;
+ gpr_asprintf(&fd_name, "%s fd=%d", name, fd);
+ grpc_iomgr_register_object(&new_fd->iomgr_object, fd_name);
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+ gpr_log(GPR_DEBUG, "FD %d %p create %s", fd, (void *)new_fd, fd_name);
+#endif
+ gpr_free(fd_name);
+
+ struct epoll_event ev = {.events = (uint32_t)(EPOLLIN | EPOLLOUT | EPOLLET),
+ .data.ptr = new_fd};
+ if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, fd, &ev) != 0) {
+ gpr_log(GPR_ERROR, "epoll_ctl failed: %s", strerror(errno));
+ }
+
+ return new_fd;
+}
+
+static int fd_wrapped_fd(grpc_fd *fd) { return fd->fd; }
+
+/* Might be called multiple times */
+static void fd_shutdown(grpc_exec_ctx *exec_ctx, grpc_fd *fd, grpc_error *why) {
+ if (grpc_lfev_set_shutdown(exec_ctx, &fd->read_closure,
+ GRPC_ERROR_REF(why))) {
+ shutdown(fd->fd, SHUT_RDWR);
+ grpc_lfev_set_shutdown(exec_ctx, &fd->write_closure, GRPC_ERROR_REF(why));
+ }
+ GRPC_ERROR_UNREF(why);
+}
+
+static void fd_orphan(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *on_done, int *release_fd,
+ const char *reason) {
+ grpc_error *error = GRPC_ERROR_NONE;
+
+ if (!grpc_lfev_is_shutdown(&fd->read_closure)) {
+ fd_shutdown(exec_ctx, fd, GRPC_ERROR_CREATE_FROM_COPIED_STRING(reason));
+ }
+
+ /* If release_fd is not NULL, we should be relinquishing control of the file
+ descriptor fd->fd (but we still own the grpc_fd structure). */
+ if (release_fd != NULL) {
+ *release_fd = fd->fd;
+ } else {
+ close(fd->fd);
+ }
+
+ grpc_closure_sched(exec_ctx, on_done, GRPC_ERROR_REF(error));
+
+ grpc_iomgr_unregister_object(&fd->iomgr_object);
+ grpc_lfev_destroy(&fd->read_closure);
+ grpc_lfev_destroy(&fd->write_closure);
+
+ gpr_mu_lock(&fd_freelist_mu);
+ fd->freelist_next = fd_freelist;
+ fd_freelist = fd;
+ gpr_mu_unlock(&fd_freelist_mu);
+}
+
+static grpc_pollset *fd_get_read_notifier_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_fd *fd) {
+ gpr_atm notifier = gpr_atm_acq_load(&fd->read_notifier_pollset);
+ return (grpc_pollset *)notifier;
+}
+
+static bool fd_is_shutdown(grpc_fd *fd) {
+ return grpc_lfev_is_shutdown(&fd->read_closure);
+}
+
+static void fd_notify_on_read(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->read_closure, closure);
+}
+
+static void fd_notify_on_write(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->write_closure, closure);
+}
+
+static grpc_workqueue *fd_get_workqueue(grpc_fd *fd) {
+ return (grpc_workqueue *)0xb0b51ed;
+}
+
+static void fd_become_readable(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_pollset *notifier) {
+ grpc_lfev_set_ready(exec_ctx, &fd->read_closure);
+
+ /* Note, it is possible that fd_become_readable might be called twice with
+ different 'notifier's when an fd becomes readable and it is in two epoll
+ sets (This can happen briefly during polling island merges). In such cases
+ it does not really matter which notifer is set as the read_notifier_pollset
+ (They would both point to the same polling island anyway) */
+ /* Use release store to match with acquire load in fd_get_read_notifier */
+ gpr_atm_rel_store(&fd->read_notifier_pollset, (gpr_atm)notifier);
+}
+
+static void fd_become_writable(grpc_exec_ctx *exec_ctx, grpc_fd *fd) {
+ grpc_lfev_set_ready(exec_ctx, &fd->write_closure);
+}
+
+/*******************************************************************************
+ * Pollset Definitions
+ */
+
+GPR_TLS_DECL(g_current_thread_pollset);
+GPR_TLS_DECL(g_current_thread_worker);
+static gpr_atm g_active_poller;
+static pollset_neighbourhood *g_neighbourhoods;
+static size_t g_num_neighbourhoods;
+static gpr_mu g_wq_mu;
+static grpc_closure_list g_wq_items;
+
+/* Return true if first in list */
+static bool worker_insert(grpc_pollset *pollset, grpc_pollset_worker *worker) {
+ if (pollset->root_worker == NULL) {
+ pollset->root_worker = worker;
+ worker->next = worker->prev = worker;
+ return true;
+ } else {
+ worker->next = pollset->root_worker;
+ worker->prev = worker->next->prev;
+ worker->next->prev = worker;
+ worker->prev->next = worker;
+ return false;
+ }
+}
+
+/* Return true if last in list */
+typedef enum { EMPTIED, NEW_ROOT, REMOVED } worker_remove_result;
+
+static worker_remove_result worker_remove(grpc_pollset *pollset,
+ grpc_pollset_worker *worker) {
+ if (worker == pollset->root_worker) {
+ if (worker == worker->next) {
+ pollset->root_worker = NULL;
+ return EMPTIED;
+ } else {
+ pollset->root_worker = worker->next;
+ worker->prev->next = worker->next;
+ worker->next->prev = worker->prev;
+ return NEW_ROOT;
+ }
+ } else {
+ worker->prev->next = worker->next;
+ worker->next->prev = worker->prev;
+ return REMOVED;
+ }
+}
+
+static size_t choose_neighbourhood(void) {
+ return (size_t)gpr_cpu_current_cpu() % g_num_neighbourhoods;
+}
+
+static grpc_error *pollset_global_init(void) {
+ gpr_tls_init(&g_current_thread_pollset);
+ gpr_tls_init(&g_current_thread_worker);
+ gpr_atm_no_barrier_store(&g_active_poller, 0);
+ global_wakeup_fd.read_fd = -1;
+ grpc_error *err = grpc_wakeup_fd_init(&global_wakeup_fd);
+ gpr_mu_init(&g_wq_mu);
+ g_wq_items = (grpc_closure_list)GRPC_CLOSURE_LIST_INIT;
+ if (err != GRPC_ERROR_NONE) return err;
+ struct epoll_event ev = {.events = (uint32_t)(EPOLLIN | EPOLLET),
+ .data.ptr = &global_wakeup_fd};
+ if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, global_wakeup_fd.read_fd, &ev) != 0) {
+ return GRPC_OS_ERROR(errno, "epoll_ctl");
+ }
+ g_num_neighbourhoods = GPR_CLAMP(gpr_cpu_num_cores(), 1, MAX_NEIGHBOURHOODS);
+ g_neighbourhoods =
+ gpr_zalloc(sizeof(*g_neighbourhoods) * g_num_neighbourhoods);
+ for (size_t i = 0; i < g_num_neighbourhoods; i++) {
+ gpr_mu_init(&g_neighbourhoods[i].mu);
+ }
+ return GRPC_ERROR_NONE;
+}
+
+static void pollset_global_shutdown(void) {
+ gpr_tls_destroy(&g_current_thread_pollset);
+ gpr_tls_destroy(&g_current_thread_worker);
+ gpr_mu_destroy(&g_wq_mu);
+ if (global_wakeup_fd.read_fd != -1) grpc_wakeup_fd_destroy(&global_wakeup_fd);
+ for (size_t i = 0; i < g_num_neighbourhoods; i++) {
+ gpr_mu_destroy(&g_neighbourhoods[i].mu);
+ }
+ gpr_free(g_neighbourhoods);
+}
+
+static void pollset_init(grpc_pollset *pollset, gpr_mu **mu) {
+ gpr_mu_init(&pollset->mu);
+ *mu = &pollset->mu;
+ pollset->neighbourhood = &g_neighbourhoods[choose_neighbourhood()];
+ pollset->seen_inactive = true;
+}
+
+static void pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
+ gpr_mu_lock(&pollset->mu);
+ if (!pollset->seen_inactive) {
+ pollset_neighbourhood *neighbourhood = pollset->neighbourhood;
+ gpr_mu_unlock(&pollset->mu);
+ retry_lock_neighbourhood:
+ gpr_mu_lock(&neighbourhood->mu);
+ gpr_mu_lock(&pollset->mu);
+ if (!pollset->seen_inactive) {
+ if (pollset->neighbourhood != neighbourhood) {
+ gpr_mu_unlock(&neighbourhood->mu);
+ neighbourhood = pollset->neighbourhood;
+ gpr_mu_unlock(&pollset->mu);
+ goto retry_lock_neighbourhood;
+ }
+ pollset->prev->next = pollset->next;
+ pollset->next->prev = pollset->prev;
+ if (pollset == pollset->neighbourhood->active_root) {
+ pollset->neighbourhood->active_root =
+ pollset->next == pollset ? NULL : pollset->next;
+ }
+ }
+ gpr_mu_unlock(&pollset->neighbourhood->mu);
+ }
+ gpr_mu_unlock(&pollset->mu);
+ gpr_mu_destroy(&pollset->mu);
+}
+
+static grpc_error *pollset_kick_all(grpc_pollset *pollset) {
+ grpc_error *error = GRPC_ERROR_NONE;
+ if (pollset->root_worker != NULL) {
+ grpc_pollset_worker *worker = pollset->root_worker;
+ do {
+ if (worker->initialized_cv) {
+ worker->kick_state = KICKED;
+ gpr_cv_signal(&worker->cv);
+ } else {
+ worker->kick_state = KICKED;
+ append_error(&error, grpc_wakeup_fd_wakeup(&global_wakeup_fd),
+ "pollset_shutdown");
+ }
+
+ worker = worker->next;
+ } while (worker != pollset->root_worker);
+ }
+ return error;
+}
+
+static void pollset_maybe_finish_shutdown(grpc_exec_ctx *exec_ctx,
+ grpc_pollset *pollset) {
+ if (pollset->shutdown_closure != NULL && pollset->root_worker == NULL &&
+ pollset->begin_refs == 0) {
+ grpc_closure_sched(exec_ctx, pollset->shutdown_closure, GRPC_ERROR_NONE);
+ pollset->shutdown_closure = NULL;
+ }
+}
+
+static void pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_closure *closure) {
+ GPR_ASSERT(pollset->shutdown_closure == NULL);
+ pollset->shutdown_closure = closure;
+ GRPC_LOG_IF_ERROR("pollset_shutdown", pollset_kick_all(pollset));
+ pollset_maybe_finish_shutdown(exec_ctx, pollset);
+}
+
+#define MAX_EPOLL_EVENTS 100
+
+static int poll_deadline_to_millis_timeout(gpr_timespec deadline,
+ gpr_timespec now) {
+ gpr_timespec timeout;
+ if (gpr_time_cmp(deadline, gpr_inf_future(deadline.clock_type)) == 0) {
+ return -1;
+ }
+
+ if (gpr_time_cmp(deadline, now) <= 0) {
+ return 0;
+ }
+
+ static const gpr_timespec round_up = {
+ .clock_type = GPR_TIMESPAN, .tv_sec = 0, .tv_nsec = GPR_NS_PER_MS - 1};
+ timeout = gpr_time_sub(deadline, now);
+ int millis = gpr_time_to_millis(gpr_time_add(timeout, round_up));
+ return millis >= 1 ? millis : 1;
+}
+
+static grpc_error *pollset_epoll(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ gpr_timespec now, gpr_timespec deadline) {
+ struct epoll_event events[MAX_EPOLL_EVENTS];
+ static const char *err_desc = "pollset_poll";
+
+ int timeout = poll_deadline_to_millis_timeout(deadline, now);
+
+ if (timeout != 0) {
+ GRPC_SCHEDULING_START_BLOCKING_REGION;
+ }
+ int r;
+ do {
+ r = epoll_wait(g_epfd, events, MAX_EPOLL_EVENTS, timeout);
+ } while (r < 0 && errno == EINTR);
+ if (timeout != 0) {
+ GRPC_SCHEDULING_END_BLOCKING_REGION;
+ }
+
+ if (r < 0) return GRPC_OS_ERROR(errno, "epoll_wait");
+
+ grpc_error *error = GRPC_ERROR_NONE;
+ for (int i = 0; i < r; i++) {
+ void *data_ptr = events[i].data.ptr;
+ if (data_ptr == &global_wakeup_fd) {
+ gpr_mu_lock(&g_wq_mu);
+ grpc_closure_list_move(&g_wq_items, &exec_ctx->closure_list);
+ gpr_mu_unlock(&g_wq_mu);
+ append_error(&error, grpc_wakeup_fd_consume_wakeup(&global_wakeup_fd),
+ err_desc);
+ } else {
+ grpc_fd *fd = (grpc_fd *)(data_ptr);
+ bool cancel = (events[i].events & (EPOLLERR | EPOLLHUP)) != 0;
+ bool read_ev = (events[i].events & (EPOLLIN | EPOLLPRI)) != 0;
+ bool write_ev = (events[i].events & EPOLLOUT) != 0;
+ if (read_ev || cancel) {
+ fd_become_readable(exec_ctx, fd, pollset);
+ }
+ if (write_ev || cancel) {
+ fd_become_writable(exec_ctx, fd);
+ }
+ }
+ }
+
+ return error;
+}
+
+static bool begin_worker(grpc_pollset *pollset, grpc_pollset_worker *worker,
+ grpc_pollset_worker **worker_hdl, gpr_timespec *now,
+ gpr_timespec deadline) {
+ if (worker_hdl != NULL) *worker_hdl = worker;
+ worker->initialized_cv = false;
+ worker->kick_state = UNKICKED;
+ worker->schedule_on_end_work = (grpc_closure_list)GRPC_CLOSURE_LIST_INIT;
+ pollset->begin_refs++;
+
+ if (pollset->seen_inactive) {
+ // pollset has been observed to be inactive, we need to move back to the
+ // active list
+ bool is_reassigning = false;
+ if (!pollset->reassigning_neighbourhood) {
+ is_reassigning = true;
+ pollset->reassigning_neighbourhood = true;
+ pollset->neighbourhood = &g_neighbourhoods[choose_neighbourhood()];
+ }
+ pollset_neighbourhood *neighbourhood = pollset->neighbourhood;
+ gpr_mu_unlock(&pollset->mu);
+ // pollset unlocked: state may change (even worker->kick_state)
+ retry_lock_neighbourhood:
+ gpr_mu_lock(&neighbourhood->mu);
+ gpr_mu_lock(&pollset->mu);
+ if (pollset->seen_inactive) {
+ if (neighbourhood != pollset->neighbourhood) {
+ gpr_mu_unlock(&neighbourhood->mu);
+ neighbourhood = pollset->neighbourhood;
+ gpr_mu_unlock(&pollset->mu);
+ goto retry_lock_neighbourhood;
+ }
+ pollset->seen_inactive = false;
+ if (neighbourhood->active_root == NULL) {
+ neighbourhood->active_root = pollset->next = pollset->prev = pollset;
+ if (gpr_atm_no_barrier_cas(&g_active_poller, 0, (gpr_atm)worker)) {
+ worker->kick_state = DESIGNATED_POLLER;
+ }
+ } else {
+ pollset->next = neighbourhood->active_root;
+ pollset->prev = pollset->next->prev;
+ pollset->next->prev = pollset->prev->next = pollset;
+ }
+ }
+ if (is_reassigning) {
+ GPR_ASSERT(pollset->reassigning_neighbourhood);
+ pollset->reassigning_neighbourhood = false;
+ }
+ gpr_mu_unlock(&neighbourhood->mu);
+ }
+ worker_insert(pollset, worker);
+ pollset->begin_refs--;
+ if (worker->kick_state == UNKICKED) {
+ GPR_ASSERT(gpr_atm_no_barrier_load(&g_active_poller) != (gpr_atm)worker);
+ worker->initialized_cv = true;
+ gpr_cv_init(&worker->cv);
+ while (worker->kick_state == UNKICKED &&
+ pollset->shutdown_closure == NULL) {
+ if (gpr_cv_wait(&worker->cv, &pollset->mu, deadline) &&
+ worker->kick_state == UNKICKED) {
+ worker->kick_state = KICKED;
+ }
+ }
+ *now = gpr_now(now->clock_type);
+ }
+
+ return worker->kick_state == DESIGNATED_POLLER &&
+ pollset->shutdown_closure == NULL;
+}
+
+static bool check_neighbourhood_for_available_poller(
+ pollset_neighbourhood *neighbourhood) {
+ bool found_worker = false;
+ do {
+ grpc_pollset *inspect = neighbourhood->active_root;
+ if (inspect == NULL) {
+ break;
+ }
+ gpr_mu_lock(&inspect->mu);
+ GPR_ASSERT(!inspect->seen_inactive);
+ grpc_pollset_worker *inspect_worker = inspect->root_worker;
+ if (inspect_worker != NULL) {
+ do {
+ switch (inspect_worker->kick_state) {
+ case UNKICKED:
+ if (gpr_atm_no_barrier_cas(&g_active_poller, 0,
+ (gpr_atm)inspect_worker)) {
+ inspect_worker->kick_state = DESIGNATED_POLLER;
+ if (inspect_worker->initialized_cv) {
+ gpr_cv_signal(&inspect_worker->cv);
+ }
+ }
+ // even if we didn't win the cas, there's a worker, we can stop
+ found_worker = true;
+ break;
+ case KICKED:
+ break;
+ case DESIGNATED_POLLER:
+ found_worker = true; // ok, so someone else found the worker, but
+ // we'll accept that
+ break;
+ }
+ inspect_worker = inspect_worker->next;
+ } while (inspect_worker != inspect->root_worker);
+ }
+ if (!found_worker) {
+ inspect->seen_inactive = true;
+ if (inspect == neighbourhood->active_root) {
+ neighbourhood->active_root =
+ inspect->next == inspect ? NULL : inspect->next;
+ }
+ inspect->next->prev = inspect->prev;
+ inspect->prev->next = inspect->next;
+ inspect->next = inspect->prev = NULL;
+ }
+ gpr_mu_unlock(&inspect->mu);
+ } while (!found_worker);
+ return found_worker;
+}
+
+static void end_worker(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_pollset_worker *worker,
+ grpc_pollset_worker **worker_hdl) {
+ if (worker_hdl != NULL) *worker_hdl = NULL;
+ worker->kick_state = KICKED;
+ grpc_closure_list_move(&worker->schedule_on_end_work,
+ &exec_ctx->closure_list);
+ if (gpr_atm_no_barrier_load(&g_active_poller) == (gpr_atm)worker) {
+ if (worker->next != worker && worker->next->kick_state == UNKICKED) {
+ GPR_ASSERT(worker->next->initialized_cv);
+ gpr_atm_no_barrier_store(&g_active_poller, (gpr_atm)worker->next);
+ worker->next->kick_state = DESIGNATED_POLLER;
+ gpr_cv_signal(&worker->next->cv);
+ if (grpc_exec_ctx_has_work(exec_ctx)) {
+ gpr_mu_unlock(&pollset->mu);
+ grpc_exec_ctx_flush(exec_ctx);
+ gpr_mu_lock(&pollset->mu);
+ }
+ } else {
+ gpr_atm_no_barrier_store(&g_active_poller, 0);
+ gpr_mu_unlock(&pollset->mu);
+ size_t poller_neighbourhood_idx =
+ (size_t)(pollset->neighbourhood - g_neighbourhoods);
+ bool found_worker = false;
+ bool scan_state[MAX_NEIGHBOURHOODS];
+ for (size_t i = 0; !found_worker && i < g_num_neighbourhoods; i++) {
+ pollset_neighbourhood *neighbourhood =
+ &g_neighbourhoods[(poller_neighbourhood_idx + i) %
+ g_num_neighbourhoods];
+ if (gpr_mu_trylock(&neighbourhood->mu)) {
+ found_worker =
+ check_neighbourhood_for_available_poller(neighbourhood);
+ gpr_mu_unlock(&neighbourhood->mu);
+ scan_state[i] = true;
+ } else {
+ scan_state[i] = false;
+ }
+ }
+ for (size_t i = 0; !found_worker && i < g_num_neighbourhoods; i++) {
+ if (scan_state[i]) continue;
+ pollset_neighbourhood *neighbourhood =
+ &g_neighbourhoods[(poller_neighbourhood_idx + i) %
+ g_num_neighbourhoods];
+ gpr_mu_lock(&neighbourhood->mu);
+ found_worker = check_neighbourhood_for_available_poller(neighbourhood);
+ gpr_mu_unlock(&neighbourhood->mu);
+ }
+ grpc_exec_ctx_flush(exec_ctx);
+ gpr_mu_lock(&pollset->mu);
+ }
+ } else if (grpc_exec_ctx_has_work(exec_ctx)) {
+ gpr_mu_unlock(&pollset->mu);
+ grpc_exec_ctx_flush(exec_ctx);
+ gpr_mu_lock(&pollset->mu);
+ }
+ if (worker->initialized_cv) {
+ gpr_cv_destroy(&worker->cv);
+ }
+ if (EMPTIED == worker_remove(pollset, worker)) {
+ pollset_maybe_finish_shutdown(exec_ctx, pollset);
+ }
+ GPR_ASSERT(gpr_atm_no_barrier_load(&g_active_poller) != (gpr_atm)worker);
+}
+
+/* pollset->po.mu lock must be held by the caller before calling this.
+ The function pollset_work() may temporarily release the lock (pollset->po.mu)
+ during the course of its execution but it will always re-acquire the lock and
+ ensure that it is held by the time the function returns */
+static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_pollset_worker **worker_hdl,
+ gpr_timespec now, gpr_timespec deadline) {
+ grpc_pollset_worker worker;
+ grpc_error *error = GRPC_ERROR_NONE;
+ static const char *err_desc = "pollset_work";
+ if (pollset->kicked_without_poller) {
+ pollset->kicked_without_poller = false;
+ return GRPC_ERROR_NONE;
+ }
+ gpr_tls_set(&g_current_thread_pollset, (intptr_t)pollset);
+ if (begin_worker(pollset, &worker, worker_hdl, &now, deadline)) {
+ gpr_tls_set(&g_current_thread_worker, (intptr_t)&worker);
+ GPR_ASSERT(!pollset->shutdown_closure);
+ GPR_ASSERT(!pollset->seen_inactive);
+ gpr_mu_unlock(&pollset->mu);
+ append_error(&error, pollset_epoll(exec_ctx, pollset, now, deadline),
+ err_desc);
+ gpr_mu_lock(&pollset->mu);
+ gpr_tls_set(&g_current_thread_worker, 0);
+ }
+ end_worker(exec_ctx, pollset, &worker, worker_hdl);
+ gpr_tls_set(&g_current_thread_pollset, 0);
+ return error;
+}
+
+static grpc_error *pollset_kick(grpc_pollset *pollset,
+ grpc_pollset_worker *specific_worker) {
+ if (specific_worker == NULL) {
+ if (gpr_tls_get(&g_current_thread_pollset) != (intptr_t)pollset) {
+ grpc_pollset_worker *root_worker = pollset->root_worker;
+ if (root_worker == NULL) {
+ pollset->kicked_without_poller = true;
+ return GRPC_ERROR_NONE;
+ }
+ grpc_pollset_worker *next_worker = root_worker->next;
+ if (root_worker == next_worker &&
+ root_worker == (grpc_pollset_worker *)gpr_atm_no_barrier_load(
+ &g_active_poller)) {
+ root_worker->kick_state = KICKED;
+ return grpc_wakeup_fd_wakeup(&global_wakeup_fd);
+ } else if (next_worker->kick_state == UNKICKED) {
+ GPR_ASSERT(next_worker->initialized_cv);
+ next_worker->kick_state = KICKED;
+ gpr_cv_signal(&next_worker->cv);
+ return GRPC_ERROR_NONE;
+ } else {
+ return GRPC_ERROR_NONE;
+ }
+ } else {
+ return GRPC_ERROR_NONE;
+ }
+ } else if (specific_worker->kick_state == KICKED) {
+ return GRPC_ERROR_NONE;
+ } else if (gpr_tls_get(&g_current_thread_worker) ==
+ (intptr_t)specific_worker) {
+ specific_worker->kick_state = KICKED;
+ return GRPC_ERROR_NONE;
+ } else if (specific_worker ==
+ (grpc_pollset_worker *)gpr_atm_no_barrier_load(&g_active_poller)) {
+ specific_worker->kick_state = KICKED;
+ return grpc_wakeup_fd_wakeup(&global_wakeup_fd);
+ } else if (specific_worker->initialized_cv) {
+ specific_worker->kick_state = KICKED;
+ gpr_cv_signal(&specific_worker->cv);
+ return GRPC_ERROR_NONE;
+ } else {
+ specific_worker->kick_state = KICKED;
+ return GRPC_ERROR_NONE;
+ }
+}
+
+static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_fd *fd) {}
+
+/*******************************************************************************
+ * Workqueue Definitions
+ */
+
+#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue,
+ const char *file, int line,
+ const char *reason) {
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx, grpc_workqueue *workqueue,
+ const char *file, int line, const char *reason) {}
+#else
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue) {
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx,
+ grpc_workqueue *workqueue) {}
+#endif
+
+static void wq_sched(grpc_exec_ctx *exec_ctx, grpc_closure *closure,
+ grpc_error *error) {
+ // find a neighbourhood to wakeup
+ bool scheduled = false;
+ size_t initial_neighbourhood = choose_neighbourhood();
+ for (size_t i = 0; !scheduled && i < g_num_neighbourhoods; i++) {
+ pollset_neighbourhood *neighbourhood =
+ &g_neighbourhoods[(initial_neighbourhood + i) % g_num_neighbourhoods];
+ if (gpr_mu_trylock(&neighbourhood->mu)) {
+ if (neighbourhood->active_root != NULL) {
+ grpc_pollset *inspect = neighbourhood->active_root;
+ do {
+ if (gpr_mu_trylock(&inspect->mu)) {
+ if (inspect->root_worker != NULL) {
+ grpc_pollset_worker *inspect_worker = inspect->root_worker;
+ do {
+ if (inspect_worker->kick_state == UNKICKED) {
+ inspect_worker->kick_state = KICKED;
+ grpc_closure_list_append(
+ &inspect_worker->schedule_on_end_work, closure, error);
+ if (inspect_worker->initialized_cv) {
+ gpr_cv_signal(&inspect_worker->cv);
+ }
+ scheduled = true;
+ }
+ inspect_worker = inspect_worker->next;
+ } while (!scheduled && inspect_worker != inspect->root_worker);
+ }
+ gpr_mu_unlock(&inspect->mu);
+ }
+ inspect = inspect->next;
+ } while (!scheduled && inspect != neighbourhood->active_root);
+ }
+ gpr_mu_unlock(&neighbourhood->mu);
+ }
+ }
+ if (!scheduled) {
+ gpr_mu_lock(&g_wq_mu);
+ grpc_closure_list_append(&g_wq_items, closure, error);
+ gpr_mu_unlock(&g_wq_mu);
+ GRPC_LOG_IF_ERROR("workqueue_scheduler",
+ grpc_wakeup_fd_wakeup(&global_wakeup_fd));
+ }
+}
+
+static const grpc_closure_scheduler_vtable
+ singleton_workqueue_scheduler_vtable = {wq_sched, wq_sched,
+ "epoll1_workqueue"};
+
+static grpc_closure_scheduler singleton_workqueue_scheduler = {
+ &singleton_workqueue_scheduler_vtable};
+
+static grpc_closure_scheduler *workqueue_scheduler(grpc_workqueue *workqueue) {
+ return &singleton_workqueue_scheduler;
+}
+
+/*******************************************************************************
+ * Pollset-set Definitions
+ */
+
+static grpc_pollset_set *pollset_set_create(void) {
+ return (grpc_pollset_set *)((intptr_t)0xdeafbeef);
+}
+
+static void pollset_set_destroy(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss) {}
+
+static void pollset_set_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {}
+
+static void pollset_set_del_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {}
+
+static void pollset_set_add_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {}
+
+static void pollset_set_del_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {}
+
+static void pollset_set_add_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {}
+
+static void pollset_set_del_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {}
+
+/*******************************************************************************
+ * Event engine binding
+ */
+
+static void shutdown_engine(void) {
+ fd_global_shutdown();
+ pollset_global_shutdown();
+}
+
+static const grpc_event_engine_vtable vtable = {
+ .pollset_size = sizeof(grpc_pollset),
+
+ .fd_create = fd_create,
+ .fd_wrapped_fd = fd_wrapped_fd,
+ .fd_orphan = fd_orphan,
+ .fd_shutdown = fd_shutdown,
+ .fd_is_shutdown = fd_is_shutdown,
+ .fd_notify_on_read = fd_notify_on_read,
+ .fd_notify_on_write = fd_notify_on_write,
+ .fd_get_read_notifier_pollset = fd_get_read_notifier_pollset,
+ .fd_get_workqueue = fd_get_workqueue,
+
+ .pollset_init = pollset_init,
+ .pollset_shutdown = pollset_shutdown,
+ .pollset_destroy = pollset_destroy,
+ .pollset_work = pollset_work,
+ .pollset_kick = pollset_kick,
+ .pollset_add_fd = pollset_add_fd,
+
+ .pollset_set_create = pollset_set_create,
+ .pollset_set_destroy = pollset_set_destroy,
+ .pollset_set_add_pollset = pollset_set_add_pollset,
+ .pollset_set_del_pollset = pollset_set_del_pollset,
+ .pollset_set_add_pollset_set = pollset_set_add_pollset_set,
+ .pollset_set_del_pollset_set = pollset_set_del_pollset_set,
+ .pollset_set_add_fd = pollset_set_add_fd,
+ .pollset_set_del_fd = pollset_set_del_fd,
+
+ .workqueue_ref = workqueue_ref,
+ .workqueue_unref = workqueue_unref,
+ .workqueue_scheduler = workqueue_scheduler,
+
+ .shutdown_engine = shutdown_engine,
+};
+
+/* It is possible that GLIBC has epoll but the underlying kernel doesn't.
+ * Create a dummy epoll_fd to make sure epoll support is available */
+const grpc_event_engine_vtable *grpc_init_epoll1_linux(bool explicit_request) {
+ /* TODO(ctiller): temporary, until this stabilizes */
+ if (!explicit_request) return NULL;
+
+ if (!grpc_has_wakeup_fd()) {
+ return NULL;
+ }
+
+ g_epfd = epoll_create1(EPOLL_CLOEXEC);
+ if (g_epfd < 0) {
+ gpr_log(GPR_ERROR, "epoll unavailable");
+ return NULL;
+ }
+
+ fd_global_init();
+
+ if (!GRPC_LOG_IF_ERROR("pollset_global_init", pollset_global_init())) {
+ close(g_epfd);
+ fd_global_shutdown();
+ return NULL;
+ }
+
+ return &vtable;
+}
+
+#else /* defined(GRPC_LINUX_EPOLL) */
+#if defined(GRPC_POSIX_SOCKET)
+#include "src/core/lib/iomgr/ev_posix.h"
+/* If GRPC_LINUX_EPOLL is not defined, it means epoll is not available. Return
+ * NULL */
+const grpc_event_engine_vtable *grpc_init_epoll1_linux(bool explicit_request) {
+ return NULL;
+}
+#endif /* defined(GRPC_POSIX_SOCKET) */
+#endif /* !defined(GRPC_LINUX_EPOLL) */
diff --git a/src/core/lib/iomgr/ev_epoll1_linux.h b/src/core/lib/iomgr/ev_epoll1_linux.h
new file mode 100644
index 0000000000..bd52478a7c
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epoll1_linux.h
@@ -0,0 +1,44 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_IOMGR_EV_EPOLL1_LINUX_H
+#define GRPC_CORE_LIB_IOMGR_EV_EPOLL1_LINUX_H
+
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/port.h"
+
+// a polling engine that utilizes a singleton epoll set and turnstile polling
+
+const grpc_event_engine_vtable *grpc_init_epoll1_linux(bool explicit_request);
+
+#endif /* GRPC_CORE_LIB_IOMGR_EV_EPOLL1_LINUX_H */
diff --git a/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c b/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
new file mode 100644
index 0000000000..d23bf6c06c
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
@@ -0,0 +1,2146 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/lib/iomgr/port.h"
+
+/* This polling engine is only relevant on linux kernels supporting epoll() */
+#ifdef GRPC_LINUX_EPOLL
+
+#include "src/core/lib/iomgr/ev_epoll_limited_pollers_linux.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <grpc/support/alloc.h>
+#include <grpc/support/log.h>
+#include <grpc/support/string_util.h>
+#include <grpc/support/tls.h>
+#include <grpc/support/useful.h>
+
+#include "src/core/lib/debug/trace.h"
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/iomgr_internal.h"
+#include "src/core/lib/iomgr/lockfree_event.h"
+#include "src/core/lib/iomgr/timer.h"
+#include "src/core/lib/iomgr/wakeup_fd_posix.h"
+#include "src/core/lib/iomgr/workqueue.h"
+#include "src/core/lib/profiling/timers.h"
+#include "src/core/lib/support/block_annotate.h"
+#include "src/core/lib/support/env.h"
+
+#define GRPC_POLLING_TRACE(fmt, ...) \
+ if (GRPC_TRACER_ON(grpc_polling_trace)) { \
+ gpr_log(GPR_INFO, (fmt), __VA_ARGS__); \
+ }
+
+#define GRPC_POLLSET_KICK_BROADCAST ((grpc_pollset_worker *)1)
+
+/* Uncomment the following to enable extra checks on poll_object operations */
+/* #define PO_DEBUG */
+
+/* The maximum number of polling threads per polling island. By default no
+ limit */
+static int g_max_pollers_per_pi = INT_MAX;
+
+static int grpc_wakeup_signal = -1;
+static bool is_grpc_wakeup_signal_initialized = false;
+
+/* Implements the function defined in grpc_posix.h. This function might be
+ * called before even calling grpc_init() to set either a different signal to
+ * use. If signum == -1, then the use of signals is disabled */
+static void grpc_use_signal(int signum) {
+ grpc_wakeup_signal = signum;
+ is_grpc_wakeup_signal_initialized = true;
+
+ if (grpc_wakeup_signal < 0) {
+ gpr_log(GPR_INFO,
+ "Use of signals is disabled. Epoll engine will not be used");
+ } else {
+ gpr_log(GPR_INFO, "epoll engine will be using signal: %d",
+ grpc_wakeup_signal);
+ }
+}
+
+struct polling_island;
+
+typedef enum {
+ POLL_OBJ_FD,
+ POLL_OBJ_POLLSET,
+ POLL_OBJ_POLLSET_SET
+} poll_obj_type;
+
+typedef struct poll_obj {
+#ifdef PO_DEBUG
+ poll_obj_type obj_type;
+#endif
+ gpr_mu mu;
+ struct polling_island *pi;
+} poll_obj;
+
+static const char *poll_obj_string(poll_obj_type po_type) {
+ switch (po_type) {
+ case POLL_OBJ_FD:
+ return "fd";
+ case POLL_OBJ_POLLSET:
+ return "pollset";
+ case POLL_OBJ_POLLSET_SET:
+ return "pollset_set";
+ }
+
+ GPR_UNREACHABLE_CODE(return "UNKNOWN");
+}
+
+/*******************************************************************************
+ * Fd Declarations
+ */
+
+#define FD_FROM_PO(po) ((grpc_fd *)(po))
+
+struct grpc_fd {
+ poll_obj po;
+
+ int fd;
+ /* refst format:
+ bit 0 : 1=Active / 0=Orphaned
+ bits 1-n : refcount
+ Ref/Unref by two to avoid altering the orphaned bit */
+ gpr_atm refst;
+
+ /* The fd is either closed or we relinquished control of it. In either
+ cases, this indicates that the 'fd' on this structure is no longer
+ valid */
+ bool orphaned;
+
+ gpr_atm read_closure;
+ gpr_atm write_closure;
+
+ struct grpc_fd *freelist_next;
+ grpc_closure *on_done_closure;
+
+ /* The pollset that last noticed that the fd is readable. The actual type
+ * stored in this is (grpc_pollset *) */
+ gpr_atm read_notifier_pollset;
+
+ grpc_iomgr_object iomgr_object;
+};
+
+/* Reference counting for fds */
+// #define GRPC_FD_REF_COUNT_DEBUG
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+static void fd_ref(grpc_fd *fd, const char *reason, const char *file, int line);
+static void fd_unref(grpc_fd *fd, const char *reason, const char *file,
+ int line);
+#define GRPC_FD_REF(fd, reason) fd_ref(fd, reason, __FILE__, __LINE__)
+#define GRPC_FD_UNREF(fd, reason) fd_unref(fd, reason, __FILE__, __LINE__)
+#else
+static void fd_ref(grpc_fd *fd);
+static void fd_unref(grpc_fd *fd);
+#define GRPC_FD_REF(fd, reason) fd_ref(fd)
+#define GRPC_FD_UNREF(fd, reason) fd_unref(fd)
+#endif
+
+static void fd_global_init(void);
+static void fd_global_shutdown(void);
+
+/*******************************************************************************
+ * Polling island Declarations
+ */
+
+#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
+
+#define PI_ADD_REF(p, r) pi_add_ref_dbg((p), (r), __FILE__, __LINE__)
+#define PI_UNREF(exec_ctx, p, r) \
+ pi_unref_dbg((exec_ctx), (p), (r), __FILE__, __LINE__)
+
+#else /* defined(GRPC_WORKQUEUE_REFCOUNT_DEBUG) */
+
+#define PI_ADD_REF(p, r) pi_add_ref((p))
+#define PI_UNREF(exec_ctx, p, r) pi_unref((exec_ctx), (p))
+
+#endif /* !defined(GRPC_PI_REF_COUNT_DEBUG) */
+
+typedef struct worker_node {
+ struct worker_node *next;
+ struct worker_node *prev;
+} worker_node;
+
+/* This is also used as grpc_workqueue (by directly casing it) */
+typedef struct polling_island {
+ grpc_closure_scheduler workqueue_scheduler;
+
+ gpr_mu mu;
+ /* Ref count. Use PI_ADD_REF() and PI_UNREF() macros to increment/decrement
+ the refcount.
+ Once the ref count becomes zero, this structure is destroyed which means
+ we should ensure that there is never a scenario where a PI_ADD_REF() is
+ racing with a PI_UNREF() that just made the ref_count zero. */
+ gpr_atm ref_count;
+
+ /* Pointer to the polling_island this merged into.
+ * merged_to value is only set once in polling_island's lifetime (and that too
+ * only if the island is merged with another island). Because of this, we can
+ * use gpr_atm type here so that we can do atomic access on this and reduce
+ * lock contention on 'mu' mutex.
+ *
+ * Note that if this field is not NULL (i.e not 0), all the remaining fields
+ * (except mu and ref_count) are invalid and must be ignored. */
+ gpr_atm merged_to;
+
+ /* Number of threads currently polling on this island */
+ gpr_atm poller_count;
+ /* Mutex guarding the read end of the workqueue (must be held to pop from
+ * workqueue_items) */
+ gpr_mu workqueue_read_mu;
+ /* Queue of closures to be executed */
+ gpr_mpscq workqueue_items;
+ /* Count of items in workqueue_items */
+ gpr_atm workqueue_item_count;
+ /* Wakeup fd used to wake pollers to check the contents of workqueue_items */
+ grpc_wakeup_fd workqueue_wakeup_fd;
+
+ /* The list of workers waiting to do polling on this polling island */
+ gpr_mu worker_list_mu;
+ worker_node worker_list_head;
+
+ /* The fd of the underlying epoll set */
+ int epoll_fd;
+
+ /* The file descriptors in the epoll set */
+ size_t fd_cnt;
+ size_t fd_capacity;
+ grpc_fd **fds;
+} polling_island;
+
+/*******************************************************************************
+ * Pollset Declarations
+ */
+#define WORKER_FROM_WORKER_LIST_NODE(p) \
+ (struct grpc_pollset_worker *)(((char *)(p)) - \
+ offsetof(grpc_pollset_worker, pi_list_link))
+struct grpc_pollset_worker {
+ /* Thread id of this worker */
+ pthread_t pt_id;
+
+ /* Used to prevent a worker from getting kicked multiple times */
+ gpr_atm is_kicked;
+
+ struct grpc_pollset_worker *next;
+ struct grpc_pollset_worker *prev;
+
+ /* Indicates if it is this worker's turn to do epoll */
+ gpr_atm is_polling_turn;
+
+ /* Node in the polling island's worker list. */
+ worker_node pi_list_link;
+};
+
+struct grpc_pollset {
+ poll_obj po;
+
+ grpc_pollset_worker root_worker;
+ bool kicked_without_pollers;
+
+ bool shutting_down; /* Is the pollset shutting down ? */
+ bool finish_shutdown_called; /* Is the 'finish_shutdown_locked()' called ? */
+ grpc_closure *shutdown_done; /* Called after after shutdown is complete */
+};
+
+/*******************************************************************************
+ * Pollset-set Declarations
+ */
+struct grpc_pollset_set {
+ poll_obj po;
+};
+
+/*******************************************************************************
+ * Common helpers
+ */
+
+static bool append_error(grpc_error **composite, grpc_error *error,
+ const char *desc) {
+ if (error == GRPC_ERROR_NONE) return true;
+ if (*composite == GRPC_ERROR_NONE) {
+ *composite = GRPC_ERROR_CREATE_FROM_COPIED_STRING(desc);
+ }
+ *composite = grpc_error_add_child(*composite, error);
+ return false;
+}
+
+/*******************************************************************************
+ * Polling island Definitions
+ */
+
+/* The wakeup fd that is used to wake up all threads in a Polling island. This
+ is useful in the polling island merge operation where we need to wakeup all
+ the threads currently polling the smaller polling island (so that they can
+ start polling the new/merged polling island)
+
+ NOTE: This fd is initialized to be readable and MUST NOT be consumed i.e the
+ threads that woke up MUST NOT call grpc_wakeup_fd_consume_wakeup() */
+static grpc_wakeup_fd polling_island_wakeup_fd;
+
+/* The polling island being polled right now.
+ See comments in workqueue_maybe_wakeup for why this is tracked. */
+static __thread polling_island *g_current_thread_polling_island;
+
+/* Forward declaration */
+static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi);
+static void workqueue_enqueue(grpc_exec_ctx *exec_ctx, grpc_closure *closure,
+ grpc_error *error);
+
+#ifdef GRPC_TSAN
+/* Currently TSAN may incorrectly flag data races between epoll_ctl and
+ epoll_wait for any grpc_fd structs that are added to the epoll set via
+ epoll_ctl and are returned (within a very short window) via epoll_wait().
+
+ To work-around this race, we establish a happens-before relation between
+ the code just-before epoll_ctl() and the code after epoll_wait() by using
+ this atomic */
+gpr_atm g_epoll_sync;
+#endif /* defined(GRPC_TSAN) */
+
+static const grpc_closure_scheduler_vtable workqueue_scheduler_vtable = {
+ workqueue_enqueue, workqueue_enqueue, "workqueue"};
+
+static void pi_add_ref(polling_island *pi);
+static void pi_unref(grpc_exec_ctx *exec_ctx, polling_island *pi);
+
+#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
+static void pi_add_ref_dbg(polling_island *pi, const char *reason,
+ const char *file, int line) {
+ long old_cnt = gpr_atm_acq_load(&pi->ref_count);
+ pi_add_ref(pi);
+ gpr_log(GPR_DEBUG, "Add ref pi: %p, old: %ld -> new:%ld (%s) - (%s, %d)",
+ (void *)pi, old_cnt, old_cnt + 1, reason, file, line);
+}
+
+static void pi_unref_dbg(grpc_exec_ctx *exec_ctx, polling_island *pi,
+ const char *reason, const char *file, int line) {
+ long old_cnt = gpr_atm_acq_load(&pi->ref_count);
+ pi_unref(exec_ctx, pi);
+ gpr_log(GPR_DEBUG, "Unref pi: %p, old:%ld -> new:%ld (%s) - (%s, %d)",
+ (void *)pi, old_cnt, (old_cnt - 1), reason, file, line);
+}
+
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue,
+ const char *file, int line,
+ const char *reason) {
+ if (workqueue != NULL) {
+ pi_add_ref_dbg((polling_island *)workqueue, reason, file, line);
+ }
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx, grpc_workqueue *workqueue,
+ const char *file, int line, const char *reason) {
+ if (workqueue != NULL) {
+ pi_unref_dbg(exec_ctx, (polling_island *)workqueue, reason, file, line);
+ }
+}
+#else
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue) {
+ if (workqueue != NULL) {
+ pi_add_ref((polling_island *)workqueue);
+ }
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx,
+ grpc_workqueue *workqueue) {
+ if (workqueue != NULL) {
+ pi_unref(exec_ctx, (polling_island *)workqueue);
+ }
+}
+#endif
+
+static void pi_add_ref(polling_island *pi) {
+ gpr_atm_no_barrier_fetch_add(&pi->ref_count, 1);
+}
+
+static void pi_unref(grpc_exec_ctx *exec_ctx, polling_island *pi) {
+ /* If ref count went to zero, delete the polling island.
+ Note that this deletion not be done under a lock. Once the ref count goes
+ to zero, we are guaranteed that no one else holds a reference to the
+ polling island (and that there is no racing pi_add_ref() call either).
+
+ Also, if we are deleting the polling island and the merged_to field is
+ non-empty, we should remove a ref to the merged_to polling island
+ */
+ if (1 == gpr_atm_full_fetch_add(&pi->ref_count, -1)) {
+ polling_island *next = (polling_island *)gpr_atm_acq_load(&pi->merged_to);
+ polling_island_delete(exec_ctx, pi);
+ if (next != NULL) {
+ PI_UNREF(exec_ctx, next, "pi_delete"); /* Recursive call */
+ }
+ }
+}
+
+static void worker_node_init(worker_node *node) {
+ node->next = node->prev = node;
+}
+
+/* Not thread safe. Do under a list-level lock */
+static void push_back_worker_node(worker_node *head, worker_node *node) {
+ node->next = head;
+ node->prev = head->prev;
+ head->prev->next = node;
+ head->prev = node;
+}
+
+/* Not thread safe. Do under a list-level lock */
+static void remove_worker_node(worker_node *node) {
+ node->next->prev = node->prev;
+ node->prev->next = node->next;
+ /* If node's next and prev point to itself, the node is considered detached
+ * from the list*/
+ node->next = node->prev = node;
+}
+
+/* Not thread safe. Do under a list-level lock */
+static worker_node *pop_front_worker_node(worker_node *head) {
+ worker_node *node = head->next;
+ if (node != head) {
+ remove_worker_node(node);
+ } else {
+ node = NULL;
+ }
+
+ return node;
+}
+
+/* Returns true if the node's next and prev are pointing to itself (which
+ indicates that the node is not in the list */
+static bool is_worker_node_detached(worker_node *node) {
+ return (node->next == node->prev && node->next == node);
+}
+
+/* The caller is expected to hold pi->mu lock before calling this function
+ */
+static void polling_island_add_fds_locked(polling_island *pi, grpc_fd **fds,
+ size_t fd_count, bool add_fd_refs,
+ grpc_error **error) {
+ int err;
+ size_t i;
+ struct epoll_event ev;
+ char *err_msg;
+ const char *err_desc = "polling_island_add_fds";
+
+#ifdef GRPC_TSAN
+ /* See the definition of g_epoll_sync for more context */
+ gpr_atm_rel_store(&g_epoll_sync, (gpr_atm)0);
+#endif /* defined(GRPC_TSAN) */
+
+ for (i = 0; i < fd_count; i++) {
+ ev.events = (uint32_t)(EPOLLIN | EPOLLOUT | EPOLLET);
+ ev.data.ptr = fds[i];
+ err = epoll_ctl(pi->epoll_fd, EPOLL_CTL_ADD, fds[i]->fd, &ev);
+
+ if (err < 0) {
+ if (errno != EEXIST) {
+ gpr_asprintf(
+ &err_msg,
+ "epoll_ctl (epoll_fd: %d) add fd: %d failed with error: %d (%s)",
+ pi->epoll_fd, fds[i]->fd, errno, strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ gpr_free(err_msg);
+ }
+
+ continue;
+ }
+
+ if (pi->fd_cnt == pi->fd_capacity) {
+ pi->fd_capacity = GPR_MAX(pi->fd_capacity + 8, pi->fd_cnt * 3 / 2);
+ pi->fds = gpr_realloc(pi->fds, sizeof(grpc_fd *) * pi->fd_capacity);
+ }
+
+ pi->fds[pi->fd_cnt++] = fds[i];
+ if (add_fd_refs) {
+ GRPC_FD_REF(fds[i], "polling_island");
+ }
+ }
+}
+
+/* The caller is expected to hold pi->mu before calling this */
+static void polling_island_add_wakeup_fd_locked(polling_island *pi,
+ grpc_wakeup_fd *wakeup_fd,
+ grpc_error **error) {
+ struct epoll_event ev;
+ int err;
+ char *err_msg;
+ const char *err_desc = "polling_island_add_wakeup_fd";
+
+ ev.events = (uint32_t)(EPOLLIN | EPOLLET);
+ ev.data.ptr = wakeup_fd;
+ err = epoll_ctl(pi->epoll_fd, EPOLL_CTL_ADD,
+ GRPC_WAKEUP_FD_GET_READ_FD(wakeup_fd), &ev);
+ if (err < 0 && errno != EEXIST) {
+ gpr_asprintf(&err_msg,
+ "epoll_ctl (epoll_fd: %d) add wakeup fd: %d failed with "
+ "error: %d (%s)",
+ pi->epoll_fd, GRPC_WAKEUP_FD_GET_READ_FD(wakeup_fd), errno,
+ strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ gpr_free(err_msg);
+ }
+}
+
+/* The caller is expected to hold pi->mu lock before calling this function */
+static void polling_island_remove_all_fds_locked(polling_island *pi,
+ bool remove_fd_refs,
+ grpc_error **error) {
+ int err;
+ size_t i;
+ char *err_msg;
+ const char *err_desc = "polling_island_remove_fds";
+
+ for (i = 0; i < pi->fd_cnt; i++) {
+ err = epoll_ctl(pi->epoll_fd, EPOLL_CTL_DEL, pi->fds[i]->fd, NULL);
+ if (err < 0 && errno != ENOENT) {
+ gpr_asprintf(&err_msg,
+ "epoll_ctl (epoll_fd: %d) delete fds[%zu]: %d failed with "
+ "error: %d (%s)",
+ pi->epoll_fd, i, pi->fds[i]->fd, errno, strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ gpr_free(err_msg);
+ }
+
+ if (remove_fd_refs) {
+ GRPC_FD_UNREF(pi->fds[i], "polling_island");
+ }
+ }
+
+ pi->fd_cnt = 0;
+}
+
+/* The caller is expected to hold pi->mu lock before calling this function */
+static void polling_island_remove_fd_locked(polling_island *pi, grpc_fd *fd,
+ bool is_fd_closed,
+ grpc_error **error) {
+ int err;
+ size_t i;
+ char *err_msg;
+ const char *err_desc = "polling_island_remove_fd";
+
+ /* If fd is already closed, then it would have been automatically been removed
+ from the epoll set */
+ if (!is_fd_closed) {
+ err = epoll_ctl(pi->epoll_fd, EPOLL_CTL_DEL, fd->fd, NULL);
+ if (err < 0 && errno != ENOENT) {
+ gpr_asprintf(
+ &err_msg,
+ "epoll_ctl (epoll_fd: %d) del fd: %d failed with error: %d (%s)",
+ pi->epoll_fd, fd->fd, errno, strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ gpr_free(err_msg);
+ }
+ }
+
+ for (i = 0; i < pi->fd_cnt; i++) {
+ if (pi->fds[i] == fd) {
+ pi->fds[i] = pi->fds[--pi->fd_cnt];
+ GRPC_FD_UNREF(fd, "polling_island");
+ break;
+ }
+ }
+}
+
+/* Might return NULL in case of an error */
+static polling_island *polling_island_create(grpc_exec_ctx *exec_ctx,
+ grpc_fd *initial_fd,
+ grpc_error **error) {
+ polling_island *pi = NULL;
+ const char *err_desc = "polling_island_create";
+
+ *error = GRPC_ERROR_NONE;
+
+ pi = gpr_malloc(sizeof(*pi));
+ pi->workqueue_scheduler.vtable = &workqueue_scheduler_vtable;
+ gpr_mu_init(&pi->mu);
+ pi->fd_cnt = 0;
+ pi->fd_capacity = 0;
+ pi->fds = NULL;
+ pi->epoll_fd = -1;
+
+ gpr_mu_init(&pi->workqueue_read_mu);
+ gpr_mpscq_init(&pi->workqueue_items);
+ gpr_atm_rel_store(&pi->workqueue_item_count, 0);
+
+ gpr_atm_rel_store(&pi->ref_count, 0);
+ gpr_atm_rel_store(&pi->poller_count, 0);
+ gpr_atm_rel_store(&pi->merged_to, (gpr_atm)NULL);
+
+ gpr_mu_init(&pi->worker_list_mu);
+ worker_node_init(&pi->worker_list_head);
+
+ if (!append_error(error, grpc_wakeup_fd_init(&pi->workqueue_wakeup_fd),
+ err_desc)) {
+ goto done;
+ }
+
+ pi->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+
+ if (pi->epoll_fd < 0) {
+ append_error(error, GRPC_OS_ERROR(errno, "epoll_create1"), err_desc);
+ goto done;
+ }
+
+ polling_island_add_wakeup_fd_locked(pi, &pi->workqueue_wakeup_fd, error);
+
+ if (initial_fd != NULL) {
+ polling_island_add_fds_locked(pi, &initial_fd, 1, true, error);
+ }
+
+done:
+ if (*error != GRPC_ERROR_NONE) {
+ polling_island_delete(exec_ctx, pi);
+ pi = NULL;
+ }
+ return pi;
+}
+
+static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi) {
+ GPR_ASSERT(pi->fd_cnt == 0);
+
+ if (pi->epoll_fd >= 0) {
+ close(pi->epoll_fd);
+ }
+ GPR_ASSERT(gpr_atm_no_barrier_load(&pi->workqueue_item_count) == 0);
+ gpr_mu_destroy(&pi->workqueue_read_mu);
+ gpr_mpscq_destroy(&pi->workqueue_items);
+ gpr_mu_destroy(&pi->mu);
+ grpc_wakeup_fd_destroy(&pi->workqueue_wakeup_fd);
+ gpr_mu_destroy(&pi->worker_list_mu);
+ GPR_ASSERT(is_worker_node_detached(&pi->worker_list_head));
+
+ gpr_free(pi->fds);
+ gpr_free(pi);
+}
+
+/* Attempts to gets the last polling island in the linked list (liked by the
+ * 'merged_to' field). Since this does not lock the polling island, there are no
+ * guarantees that the island returned is the last island */
+static polling_island *polling_island_maybe_get_latest(polling_island *pi) {
+ polling_island *next = (polling_island *)gpr_atm_acq_load(&pi->merged_to);
+ while (next != NULL) {
+ pi = next;
+ next = (polling_island *)gpr_atm_acq_load(&pi->merged_to);
+ }
+
+ return pi;
+}
+
+/* Gets the lock on the *latest* polling island i.e the last polling island in
+ the linked list (linked by the 'merged_to' field). Call gpr_mu_unlock on the
+ returned polling island's mu.
+ Usage: To lock/unlock polling island "pi", do the following:
+ polling_island *pi_latest = polling_island_lock(pi);
+ ...
+ ... critical section ..
+ ...
+ gpr_mu_unlock(&pi_latest->mu); // NOTE: use pi_latest->mu. NOT pi->mu */
+static polling_island *polling_island_lock(polling_island *pi) {
+ polling_island *next = NULL;
+
+ while (true) {
+ next = (polling_island *)gpr_atm_acq_load(&pi->merged_to);
+ if (next == NULL) {
+ /* Looks like 'pi' is the last node in the linked list but unless we check
+ this by holding the pi->mu lock, we cannot be sure (i.e without the
+ pi->mu lock, we don't prevent island merges).
+ To be absolutely sure, check once more by holding the pi->mu lock */
+ gpr_mu_lock(&pi->mu);
+ next = (polling_island *)gpr_atm_acq_load(&pi->merged_to);
+ if (next == NULL) {
+ /* pi is infact the last node and we have the pi->mu lock. we're done */
+ break;
+ }
+
+ /* pi->merged_to is not NULL i.e pi isn't the last node anymore. pi->mu
+ * isn't the lock we are interested in. Continue traversing the list */
+ gpr_mu_unlock(&pi->mu);
+ }
+
+ pi = next;
+ }
+
+ return pi;
+}
+
+/* Gets the lock on the *latest* polling islands in the linked lists pointed by
+ *p and *q (and also updates *p and *q to point to the latest polling islands)
+
+ This function is needed because calling the following block of code to obtain
+ locks on polling islands (*p and *q) is prone to deadlocks.
+ {
+ polling_island_lock(*p, true);
+ polling_island_lock(*q, true);
+ }
+
+ Usage/example:
+ polling_island *p1;
+ polling_island *p2;
+ ..
+ polling_island_lock_pair(&p1, &p2);
+ ..
+ .. Critical section with both p1 and p2 locked
+ ..
+ // Release locks: Always call polling_island_unlock_pair() to release locks
+ polling_island_unlock_pair(p1, p2);
+*/
+static void polling_island_lock_pair(polling_island **p, polling_island **q) {
+ polling_island *pi_1 = *p;
+ polling_island *pi_2 = *q;
+ polling_island *next_1 = NULL;
+ polling_island *next_2 = NULL;
+
+ /* The algorithm is simple:
+ - Go to the last polling islands in the linked lists *pi_1 and *pi_2 (and
+ keep updating pi_1 and pi_2)
+ - Then obtain locks on the islands by following a lock order rule of
+ locking polling_island with lower address first
+ Special case: Before obtaining the locks, check if pi_1 and pi_2 are
+ pointing to the same island. If that is the case, we can just call
+ polling_island_lock()
+ - After obtaining both the locks, double check that the polling islands
+ are still the last polling islands in their respective linked lists
+ (this is because there might have been polling island merges before
+ we got the lock)
+ - If the polling islands are the last islands, we are done. If not,
+ release the locks and continue the process from the first step */
+ while (true) {
+ next_1 = (polling_island *)gpr_atm_acq_load(&pi_1->merged_to);
+ while (next_1 != NULL) {
+ pi_1 = next_1;
+ next_1 = (polling_island *)gpr_atm_acq_load(&pi_1->merged_to);
+ }
+
+ next_2 = (polling_island *)gpr_atm_acq_load(&pi_2->merged_to);
+ while (next_2 != NULL) {
+ pi_2 = next_2;
+ next_2 = (polling_island *)gpr_atm_acq_load(&pi_2->merged_to);
+ }
+
+ if (pi_1 == pi_2) {
+ pi_1 = pi_2 = polling_island_lock(pi_1);
+ break;
+ }
+
+ if (pi_1 < pi_2) {
+ gpr_mu_lock(&pi_1->mu);
+ gpr_mu_lock(&pi_2->mu);
+ } else {
+ gpr_mu_lock(&pi_2->mu);
+ gpr_mu_lock(&pi_1->mu);
+ }
+
+ next_1 = (polling_island *)gpr_atm_acq_load(&pi_1->merged_to);
+ next_2 = (polling_island *)gpr_atm_acq_load(&pi_2->merged_to);
+ if (next_1 == NULL && next_2 == NULL) {
+ break;
+ }
+
+ gpr_mu_unlock(&pi_1->mu);
+ gpr_mu_unlock(&pi_2->mu);
+ }
+
+ *p = pi_1;
+ *q = pi_2;
+}
+
+static void polling_island_unlock_pair(polling_island *p, polling_island *q) {
+ if (p == q) {
+ gpr_mu_unlock(&p->mu);
+ } else {
+ gpr_mu_unlock(&p->mu);
+ gpr_mu_unlock(&q->mu);
+ }
+}
+
+static void workqueue_maybe_wakeup(polling_island *pi) {
+ /* If this thread is the current poller, then it may be that it's about to
+ decrement the current poller count, so we need to look past this thread */
+ bool is_current_poller = (g_current_thread_polling_island == pi);
+ gpr_atm min_current_pollers_for_wakeup = is_current_poller ? 1 : 0;
+ gpr_atm current_pollers = gpr_atm_no_barrier_load(&pi->poller_count);
+ /* Only issue a wakeup if it's likely that some poller could come in and take
+ it right now. Note that since we do an anticipatory mpscq_pop every poll
+ loop, it's ok if we miss the wakeup here, as we'll get the work item when
+ the next poller enters anyway. */
+ if (current_pollers > min_current_pollers_for_wakeup) {
+ GRPC_LOG_IF_ERROR("workqueue_wakeup_fd",
+ grpc_wakeup_fd_wakeup(&pi->workqueue_wakeup_fd));
+ }
+}
+
+static void workqueue_move_items_to_parent(polling_island *q) {
+ polling_island *p = (polling_island *)gpr_atm_no_barrier_load(&q->merged_to);
+ if (p == NULL) {
+ return;
+ }
+ gpr_mu_lock(&q->workqueue_read_mu);
+ int num_added = 0;
+ while (gpr_atm_no_barrier_load(&q->workqueue_item_count) > 0) {
+ gpr_mpscq_node *n = gpr_mpscq_pop(&q->workqueue_items);
+ if (n != NULL) {
+ gpr_atm_no_barrier_fetch_add(&q->workqueue_item_count, -1);
+ gpr_atm_no_barrier_fetch_add(&p->workqueue_item_count, 1);
+ gpr_mpscq_push(&p->workqueue_items, n);
+ num_added++;
+ }
+ }
+ gpr_mu_unlock(&q->workqueue_read_mu);
+ if (num_added > 0) {
+ workqueue_maybe_wakeup(p);
+ }
+ workqueue_move_items_to_parent(p);
+}
+
+static polling_island *polling_island_merge(polling_island *p,
+ polling_island *q,
+ grpc_error **error) {
+ /* Get locks on both the polling islands */
+ polling_island_lock_pair(&p, &q);
+
+ if (p != q) {
+ /* Make sure that p points to the polling island with fewer fds than q */
+ if (p->fd_cnt > q->fd_cnt) {
+ GPR_SWAP(polling_island *, p, q);
+ }
+
+ /* Merge p with q i.e move all the fds from p (The one with fewer fds) to q
+ Note that the refcounts on the fds being moved will not change here.
+ This is why the last param in the following two functions is 'false') */
+ polling_island_add_fds_locked(q, p->fds, p->fd_cnt, false, error);
+ polling_island_remove_all_fds_locked(p, false, error);
+
+ /* Wakeup all the pollers (if any) on p so that they pickup this change */
+ polling_island_add_wakeup_fd_locked(p, &polling_island_wakeup_fd, error);
+
+ /* Add the 'merged_to' link from p --> q */
+ gpr_atm_rel_store(&p->merged_to, (gpr_atm)q);
+ PI_ADD_REF(q, "pi_merge"); /* To account for the new incoming ref from p */
+
+ workqueue_move_items_to_parent(p);
+ }
+ /* else if p == q, nothing needs to be done */
+
+ polling_island_unlock_pair(p, q);
+
+ /* Return the merged polling island (Note that no merge would have happened
+ if p == q which is ok) */
+ return q;
+}
+
+static void workqueue_enqueue(grpc_exec_ctx *exec_ctx, grpc_closure *closure,
+ grpc_error *error) {
+ GPR_TIMER_BEGIN("workqueue.enqueue", 0);
+ grpc_workqueue *workqueue = (grpc_workqueue *)closure->scheduler;
+ /* take a ref to the workqueue: otherwise it can happen that whatever events
+ * this kicks off ends up destroying the workqueue before this function
+ * completes */
+ GRPC_WORKQUEUE_REF(workqueue, "enqueue");
+ polling_island *pi = (polling_island *)workqueue;
+ gpr_atm last = gpr_atm_no_barrier_fetch_add(&pi->workqueue_item_count, 1);
+ closure->error_data.error = error;
+ gpr_mpscq_push(&pi->workqueue_items, &closure->next_data.atm_next);
+ if (last == 0) {
+ workqueue_maybe_wakeup(pi);
+ }
+ workqueue_move_items_to_parent(pi);
+ GRPC_WORKQUEUE_UNREF(exec_ctx, workqueue, "enqueue");
+ GPR_TIMER_END("workqueue.enqueue", 0);
+}
+
+static grpc_closure_scheduler *workqueue_scheduler(grpc_workqueue *workqueue) {
+ polling_island *pi = (polling_island *)workqueue;
+ return workqueue == NULL ? grpc_schedule_on_exec_ctx
+ : &pi->workqueue_scheduler;
+}
+
+static grpc_error *polling_island_global_init() {
+ grpc_error *error = GRPC_ERROR_NONE;
+
+ error = grpc_wakeup_fd_init(&polling_island_wakeup_fd);
+ if (error == GRPC_ERROR_NONE) {
+ error = grpc_wakeup_fd_wakeup(&polling_island_wakeup_fd);
+ }
+
+ return error;
+}
+
+static void polling_island_global_shutdown() {
+ grpc_wakeup_fd_destroy(&polling_island_wakeup_fd);
+}
+
+/*******************************************************************************
+ * Fd Definitions
+ */
+
+/* We need to keep a freelist not because of any concerns of malloc performance
+ * but instead so that implementations with multiple threads in (for example)
+ * epoll_wait deal with the race between pollset removal and incoming poll
+ * notifications.
+ *
+ * The problem is that the poller ultimately holds a reference to this
+ * object, so it is very difficult to know when is safe to free it, at least
+ * without some expensive synchronization.
+ *
+ * If we keep the object freelisted, in the worst case losing this race just
+ * becomes a spurious read notification on a reused fd.
+ */
+
+/* The alarm system needs to be able to wakeup 'some poller' sometimes
+ * (specifically when a new alarm needs to be triggered earlier than the next
+ * alarm 'epoch'). This wakeup_fd gives us something to alert on when such a
+ * case occurs. */
+
+static grpc_fd *fd_freelist = NULL;
+static gpr_mu fd_freelist_mu;
+
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+#define REF_BY(fd, n, reason) ref_by(fd, n, reason, __FILE__, __LINE__)
+#define UNREF_BY(fd, n, reason) unref_by(fd, n, reason, __FILE__, __LINE__)
+static void ref_by(grpc_fd *fd, int n, const char *reason, const char *file,
+ int line) {
+ gpr_log(GPR_DEBUG, "FD %d %p ref %d %ld -> %ld [%s; %s:%d]", fd->fd,
+ (void *)fd, n, gpr_atm_no_barrier_load(&fd->refst),
+ gpr_atm_no_barrier_load(&fd->refst) + n, reason, file, line);
+#else
+#define REF_BY(fd, n, reason) ref_by(fd, n)
+#define UNREF_BY(fd, n, reason) unref_by(fd, n)
+static void ref_by(grpc_fd *fd, int n) {
+#endif
+ GPR_ASSERT(gpr_atm_no_barrier_fetch_add(&fd->refst, n) > 0);
+}
+
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+static void unref_by(grpc_fd *fd, int n, const char *reason, const char *file,
+ int line) {
+ gpr_atm old;
+ gpr_log(GPR_DEBUG, "FD %d %p unref %d %ld -> %ld [%s; %s:%d]", fd->fd,
+ (void *)fd, n, gpr_atm_no_barrier_load(&fd->refst),
+ gpr_atm_no_barrier_load(&fd->refst) - n, reason, file, line);
+#else
+static void unref_by(grpc_fd *fd, int n) {
+ gpr_atm old;
+#endif
+ old = gpr_atm_full_fetch_add(&fd->refst, -n);
+ if (old == n) {
+ /* Add the fd to the freelist */
+ gpr_mu_lock(&fd_freelist_mu);
+ fd->freelist_next = fd_freelist;
+ fd_freelist = fd;
+ grpc_iomgr_unregister_object(&fd->iomgr_object);
+
+ grpc_lfev_destroy(&fd->read_closure);
+ grpc_lfev_destroy(&fd->write_closure);
+
+ gpr_mu_unlock(&fd_freelist_mu);
+ } else {
+ GPR_ASSERT(old > n);
+ }
+}
+
+/* Increment refcount by two to avoid changing the orphan bit */
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+static void fd_ref(grpc_fd *fd, const char *reason, const char *file,
+ int line) {
+ ref_by(fd, 2, reason, file, line);
+}
+
+static void fd_unref(grpc_fd *fd, const char *reason, const char *file,
+ int line) {
+ unref_by(fd, 2, reason, file, line);
+}
+#else
+static void fd_ref(grpc_fd *fd) { ref_by(fd, 2); }
+static void fd_unref(grpc_fd *fd) { unref_by(fd, 2); }
+#endif
+
+static void fd_global_init(void) { gpr_mu_init(&fd_freelist_mu); }
+
+static void fd_global_shutdown(void) {
+ gpr_mu_lock(&fd_freelist_mu);
+ gpr_mu_unlock(&fd_freelist_mu);
+ while (fd_freelist != NULL) {
+ grpc_fd *fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ gpr_mu_destroy(&fd->po.mu);
+ gpr_free(fd);
+ }
+ gpr_mu_destroy(&fd_freelist_mu);
+}
+
+static grpc_fd *fd_create(int fd, const char *name) {
+ grpc_fd *new_fd = NULL;
+
+ gpr_mu_lock(&fd_freelist_mu);
+ if (fd_freelist != NULL) {
+ new_fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ }
+ gpr_mu_unlock(&fd_freelist_mu);
+
+ if (new_fd == NULL) {
+ new_fd = gpr_malloc(sizeof(grpc_fd));
+ gpr_mu_init(&new_fd->po.mu);
+ }
+
+ /* Note: It is not really needed to get the new_fd->po.mu lock here. If this
+ * is a newly created fd (or an fd we got from the freelist), no one else
+ * would be holding a lock to it anyway. */
+ gpr_mu_lock(&new_fd->po.mu);
+ new_fd->po.pi = NULL;
+#ifdef PO_DEBUG
+ new_fd->po.obj_type = POLL_OBJ_FD;
+#endif
+
+ gpr_atm_rel_store(&new_fd->refst, (gpr_atm)1);
+ new_fd->fd = fd;
+ new_fd->orphaned = false;
+ grpc_lfev_init(&new_fd->read_closure);
+ grpc_lfev_init(&new_fd->write_closure);
+ gpr_atm_no_barrier_store(&new_fd->read_notifier_pollset, (gpr_atm)NULL);
+
+ new_fd->freelist_next = NULL;
+ new_fd->on_done_closure = NULL;
+
+ gpr_mu_unlock(&new_fd->po.mu);
+
+ char *fd_name;
+ gpr_asprintf(&fd_name, "%s fd=%d", name, fd);
+ grpc_iomgr_register_object(&new_fd->iomgr_object, fd_name);
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+ gpr_log(GPR_DEBUG, "FD %d %p create %s", fd, (void *)new_fd, fd_name);
+#endif
+ gpr_free(fd_name);
+ return new_fd;
+}
+
+static int fd_wrapped_fd(grpc_fd *fd) {
+ int ret_fd = -1;
+ gpr_mu_lock(&fd->po.mu);
+ if (!fd->orphaned) {
+ ret_fd = fd->fd;
+ }
+ gpr_mu_unlock(&fd->po.mu);
+
+ return ret_fd;
+}
+
+static void fd_orphan(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *on_done, int *release_fd,
+ const char *reason) {
+ bool is_fd_closed = false;
+ grpc_error *error = GRPC_ERROR_NONE;
+ polling_island *unref_pi = NULL;
+
+ gpr_mu_lock(&fd->po.mu);
+ fd->on_done_closure = on_done;
+
+ /* If release_fd is not NULL, we should be relinquishing control of the file
+ descriptor fd->fd (but we still own the grpc_fd structure). */
+ if (release_fd != NULL) {
+ *release_fd = fd->fd;
+ } else {
+ close(fd->fd);
+ is_fd_closed = true;
+ }
+
+ fd->orphaned = true;
+
+ /* Remove the active status but keep referenced. We want this grpc_fd struct
+ to be alive (and not added to freelist) until the end of this function */
+ REF_BY(fd, 1, reason);
+
+ /* Remove the fd from the polling island:
+ - Get a lock on the latest polling island (i.e the last island in the
+ linked list pointed by fd->po.pi). This is the island that
+ would actually contain the fd
+ - Remove the fd from the latest polling island
+ - Unlock the latest polling island
+ - Set fd->po.pi to NULL (but remove the ref on the polling island
+ before doing this.) */
+ if (fd->po.pi != NULL) {
+ polling_island *pi_latest = polling_island_lock(fd->po.pi);
+ polling_island_remove_fd_locked(pi_latest, fd, is_fd_closed, &error);
+ gpr_mu_unlock(&pi_latest->mu);
+
+ unref_pi = fd->po.pi;
+ fd->po.pi = NULL;
+ }
+
+ grpc_closure_sched(exec_ctx, fd->on_done_closure, GRPC_ERROR_REF(error));
+
+ gpr_mu_unlock(&fd->po.mu);
+ UNREF_BY(fd, 2, reason); /* Drop the reference */
+ if (unref_pi != NULL) {
+ /* Unref stale polling island here, outside the fd lock above.
+ The polling island owns a workqueue which owns an fd, and unreffing
+ inside the lock can cause an eventual lock loop that makes TSAN very
+ unhappy. */
+ PI_UNREF(exec_ctx, unref_pi, "fd_orphan");
+ }
+ GRPC_LOG_IF_ERROR("fd_orphan", GRPC_ERROR_REF(error));
+ GRPC_ERROR_UNREF(error);
+}
+
+static grpc_pollset *fd_get_read_notifier_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_fd *fd) {
+ gpr_atm notifier = gpr_atm_acq_load(&fd->read_notifier_pollset);
+ return (grpc_pollset *)notifier;
+}
+
+static bool fd_is_shutdown(grpc_fd *fd) {
+ return grpc_lfev_is_shutdown(&fd->read_closure);
+}
+
+/* Might be called multiple times */
+static void fd_shutdown(grpc_exec_ctx *exec_ctx, grpc_fd *fd, grpc_error *why) {
+ if (grpc_lfev_set_shutdown(exec_ctx, &fd->read_closure,
+ GRPC_ERROR_REF(why))) {
+ shutdown(fd->fd, SHUT_RDWR);
+ grpc_lfev_set_shutdown(exec_ctx, &fd->write_closure, GRPC_ERROR_REF(why));
+ }
+ GRPC_ERROR_UNREF(why);
+}
+
+static void fd_notify_on_read(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->read_closure, closure);
+}
+
+static void fd_notify_on_write(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->write_closure, closure);
+}
+
+static grpc_workqueue *fd_get_workqueue(grpc_fd *fd) {
+ gpr_mu_lock(&fd->po.mu);
+ grpc_workqueue *workqueue =
+ GRPC_WORKQUEUE_REF((grpc_workqueue *)fd->po.pi, "fd_get_workqueue");
+ gpr_mu_unlock(&fd->po.mu);
+ return workqueue;
+}
+
+/*******************************************************************************
+ * Pollset Definitions
+ */
+GPR_TLS_DECL(g_current_thread_pollset);
+GPR_TLS_DECL(g_current_thread_worker);
+static __thread bool g_initialized_sigmask;
+static __thread sigset_t g_orig_sigmask;
+static __thread sigset_t g_wakeup_sig_set;
+
+static void sig_handler(int sig_num) {
+#ifdef GRPC_EPOLL_DEBUG
+ gpr_log(GPR_INFO, "Received signal %d", sig_num);
+#endif
+}
+
+static void pollset_worker_init(grpc_pollset_worker *worker) {
+ worker->pt_id = pthread_self();
+ worker->next = worker->prev = NULL;
+ gpr_atm_no_barrier_store(&worker->is_kicked, (gpr_atm)0);
+ gpr_atm_no_barrier_store(&worker->is_polling_turn, (gpr_atm)0);
+ worker_node_init(&worker->pi_list_link);
+}
+
+static void poller_kick_init() { signal(grpc_wakeup_signal, sig_handler); }
+
+/* Global state management */
+static grpc_error *pollset_global_init(void) {
+ gpr_tls_init(&g_current_thread_pollset);
+ gpr_tls_init(&g_current_thread_worker);
+ poller_kick_init();
+ return GRPC_ERROR_NONE;
+}
+
+static void pollset_global_shutdown(void) {
+ gpr_tls_destroy(&g_current_thread_pollset);
+ gpr_tls_destroy(&g_current_thread_worker);
+}
+
+static grpc_error *worker_kick(grpc_pollset_worker *worker,
+ gpr_atm *is_kicked) {
+ grpc_error *err = GRPC_ERROR_NONE;
+
+ /* Kick the worker only if it was not already kicked */
+ if (gpr_atm_no_barrier_cas(is_kicked, (gpr_atm)0, (gpr_atm)1)) {
+ GRPC_POLLING_TRACE(
+ "pollset_worker_kick: Kicking worker: %p (thread id: %ld)",
+ (void *)worker, (long int)worker->pt_id);
+ int err_num = pthread_kill(worker->pt_id, grpc_wakeup_signal);
+ if (err_num != 0) {
+ err = GRPC_OS_ERROR(err_num, "pthread_kill");
+ }
+ }
+ return err;
+}
+
+static grpc_error *pollset_worker_kick(grpc_pollset_worker *worker) {
+ return worker_kick(worker, &worker->is_kicked);
+}
+
+static grpc_error *poller_kick(grpc_pollset_worker *worker) {
+ return worker_kick(worker, &worker->is_polling_turn);
+}
+
+/* Return 1 if the pollset has active threads in pollset_work (pollset must
+ * be locked) */
+static int pollset_has_workers(grpc_pollset *p) {
+ return p->root_worker.next != &p->root_worker;
+}
+
+static void remove_worker(grpc_pollset *p, grpc_pollset_worker *worker) {
+ worker->prev->next = worker->next;
+ worker->next->prev = worker->prev;
+}
+
+static grpc_pollset_worker *pop_front_worker(grpc_pollset *p) {
+ if (pollset_has_workers(p)) {
+ grpc_pollset_worker *w = p->root_worker.next;
+ remove_worker(p, w);
+ return w;
+ } else {
+ return NULL;
+ }
+}
+
+static void push_back_worker(grpc_pollset *p, grpc_pollset_worker *worker) {
+ worker->next = &p->root_worker;
+ worker->prev = worker->next->prev;
+ worker->prev->next = worker->next->prev = worker;
+}
+
+static void push_front_worker(grpc_pollset *p, grpc_pollset_worker *worker) {
+ worker->prev = &p->root_worker;
+ worker->next = worker->prev->next;
+ worker->prev->next = worker->next->prev = worker;
+}
+
+/* p->mu must be held before calling this function */
+static grpc_error *pollset_kick(grpc_pollset *p,
+ grpc_pollset_worker *specific_worker) {
+ GPR_TIMER_BEGIN("pollset_kick", 0);
+ grpc_error *error = GRPC_ERROR_NONE;
+ const char *err_desc = "Kick Failure";
+ grpc_pollset_worker *worker = specific_worker;
+ if (worker != NULL) {
+ if (worker == GRPC_POLLSET_KICK_BROADCAST) {
+ if (pollset_has_workers(p)) {
+ GPR_TIMER_BEGIN("pollset_kick.broadcast", 0);
+ for (worker = p->root_worker.next; worker != &p->root_worker;
+ worker = worker->next) {
+ if (gpr_tls_get(&g_current_thread_worker) != (intptr_t)worker) {
+ append_error(&error, pollset_worker_kick(worker), err_desc);
+ }
+ }
+ GPR_TIMER_END("pollset_kick.broadcast", 0);
+ } else {
+ p->kicked_without_pollers = true;
+ }
+ } else {
+ GPR_TIMER_MARK("kicked_specifically", 0);
+ if (gpr_tls_get(&g_current_thread_worker) != (intptr_t)worker) {
+ append_error(&error, pollset_worker_kick(worker), err_desc);
+ }
+ }
+ } else if (gpr_tls_get(&g_current_thread_pollset) != (intptr_t)p) {
+ /* Since worker == NULL, it means that we can kick "any" worker on this
+ pollset 'p'. If 'p' happens to be the same pollset this thread is
+ currently polling (i.e in pollset_work() function), then there is no need
+ to kick any other worker since the current thread can just absorb the
+ kick. This is the reason why we enter this case only when
+ g_current_thread_pollset is != p */
+
+ GPR_TIMER_MARK("kick_anonymous", 0);
+ worker = pop_front_worker(p);
+ if (worker != NULL) {
+ GPR_TIMER_MARK("finally_kick", 0);
+ push_back_worker(p, worker);
+ append_error(&error, pollset_worker_kick(worker), err_desc);
+ } else {
+ GPR_TIMER_MARK("kicked_no_pollers", 0);
+ p->kicked_without_pollers = true;
+ }
+ }
+
+ GPR_TIMER_END("pollset_kick", 0);
+ GRPC_LOG_IF_ERROR("pollset_kick", GRPC_ERROR_REF(error));
+ return error;
+}
+
+static void pollset_init(grpc_pollset *pollset, gpr_mu **mu) {
+ gpr_mu_init(&pollset->po.mu);
+ *mu = &pollset->po.mu;
+ pollset->po.pi = NULL;
+#ifdef PO_DEBUG
+ pollset->po.obj_type = POLL_OBJ_POLLSET;
+#endif
+
+ pollset->root_worker.next = pollset->root_worker.prev = &pollset->root_worker;
+ pollset->kicked_without_pollers = false;
+
+ pollset->shutting_down = false;
+ pollset->finish_shutdown_called = false;
+ pollset->shutdown_done = NULL;
+}
+
+/* Convert millis to timespec (clock-type is assumed to be GPR_TIMESPAN) */
+static struct timespec millis_to_timespec(int millis) {
+ struct timespec linux_ts;
+ gpr_timespec gpr_ts;
+
+ if (millis == -1) {
+ gpr_ts = gpr_inf_future(GPR_TIMESPAN);
+ } else {
+ gpr_ts = gpr_time_from_millis(millis, GPR_TIMESPAN);
+ }
+
+ linux_ts.tv_sec = (time_t)gpr_ts.tv_sec;
+ linux_ts.tv_nsec = gpr_ts.tv_nsec;
+ return linux_ts;
+}
+
+/* Convert a timespec to milliseconds:
+ - Very small or negative poll times are clamped to zero to do a non-blocking
+ poll (which becomes spin polling)
+ - Other small values are rounded up to one millisecond
+ - Longer than a millisecond polls are rounded up to the next nearest
+ millisecond to avoid spinning
+ - Infinite timeouts are converted to -1 */
+static int poll_deadline_to_millis_timeout(gpr_timespec deadline,
+ gpr_timespec now) {
+ gpr_timespec timeout;
+ static const int64_t max_spin_polling_us = 10;
+ if (gpr_time_cmp(deadline, gpr_inf_future(deadline.clock_type)) == 0) {
+ return -1;
+ }
+
+ if (gpr_time_cmp(deadline, gpr_time_add(now, gpr_time_from_micros(
+ max_spin_polling_us,
+ GPR_TIMESPAN))) <= 0) {
+ return 0;
+ }
+ timeout = gpr_time_sub(deadline, now);
+ int millis = gpr_time_to_millis(gpr_time_add(
+ timeout, gpr_time_from_nanos(GPR_NS_PER_MS - 1, GPR_TIMESPAN)));
+ return millis >= 1 ? millis : 1;
+}
+
+static void fd_become_readable(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_pollset *notifier) {
+ grpc_lfev_set_ready(exec_ctx, &fd->read_closure);
+
+ /* Note, it is possible that fd_become_readable might be called twice with
+ different 'notifier's when an fd becomes readable and it is in two epoll
+ sets (This can happen briefly during polling island merges). In such cases
+ it does not really matter which notifer is set as the read_notifier_pollset
+ (They would both point to the same polling island anyway) */
+ /* Use release store to match with acquire load in fd_get_read_notifier */
+ gpr_atm_rel_store(&fd->read_notifier_pollset, (gpr_atm)notifier);
+}
+
+static void fd_become_writable(grpc_exec_ctx *exec_ctx, grpc_fd *fd) {
+ grpc_lfev_set_ready(exec_ctx, &fd->write_closure);
+}
+
+static void pollset_release_polling_island(grpc_exec_ctx *exec_ctx,
+ grpc_pollset *ps, char *reason) {
+ if (ps->po.pi != NULL) {
+ PI_UNREF(exec_ctx, ps->po.pi, reason);
+ }
+ ps->po.pi = NULL;
+}
+
+static void finish_shutdown_locked(grpc_exec_ctx *exec_ctx,
+ grpc_pollset *pollset) {
+ /* The pollset cannot have any workers if we are at this stage */
+ GPR_ASSERT(!pollset_has_workers(pollset));
+
+ pollset->finish_shutdown_called = true;
+
+ /* Release the ref and set pollset->po.pi to NULL */
+ pollset_release_polling_island(exec_ctx, pollset, "ps_shutdown");
+ grpc_closure_sched(exec_ctx, pollset->shutdown_done, GRPC_ERROR_NONE);
+}
+
+/* pollset->po.mu lock must be held by the caller before calling this */
+static void pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_closure *closure) {
+ GPR_TIMER_BEGIN("pollset_shutdown", 0);
+ GPR_ASSERT(!pollset->shutting_down);
+ pollset->shutting_down = true;
+ pollset->shutdown_done = closure;
+ pollset_kick(pollset, GRPC_POLLSET_KICK_BROADCAST);
+
+ /* If the pollset has any workers, we cannot call finish_shutdown_locked()
+ because it would release the underlying polling island. In such a case, we
+ let the last worker call finish_shutdown_locked() from pollset_work() */
+ if (!pollset_has_workers(pollset)) {
+ GPR_ASSERT(!pollset->finish_shutdown_called);
+ GPR_TIMER_MARK("pollset_shutdown.finish_shutdown_locked", 0);
+ finish_shutdown_locked(exec_ctx, pollset);
+ }
+ GPR_TIMER_END("pollset_shutdown", 0);
+}
+
+/* pollset_shutdown is guaranteed to be called before pollset_destroy. So other
+ * than destroying the mutexes, there is nothing special that needs to be done
+ * here */
+static void pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
+ GPR_ASSERT(!pollset_has_workers(pollset));
+ gpr_mu_destroy(&pollset->po.mu);
+}
+
+static bool maybe_do_workqueue_work(grpc_exec_ctx *exec_ctx,
+ polling_island *pi) {
+ if (gpr_mu_trylock(&pi->workqueue_read_mu)) {
+ gpr_mpscq_node *n = gpr_mpscq_pop(&pi->workqueue_items);
+ gpr_mu_unlock(&pi->workqueue_read_mu);
+ if (n != NULL) {
+ if (gpr_atm_full_fetch_add(&pi->workqueue_item_count, -1) > 1) {
+ workqueue_maybe_wakeup(pi);
+ }
+ grpc_closure *c = (grpc_closure *)n;
+ grpc_error *error = c->error_data.error;
+#ifndef NDEBUG
+ c->scheduled = false;
+#endif
+ c->cb(exec_ctx, c->cb_arg, error);
+ GRPC_ERROR_UNREF(error);
+ return true;
+ } else if (gpr_atm_no_barrier_load(&pi->workqueue_item_count) > 0) {
+ /* n == NULL might mean there's work but it's not available to be popped
+ * yet - try to ensure another workqueue wakes up to check shortly if so
+ */
+ workqueue_maybe_wakeup(pi);
+ }
+ }
+ return false;
+}
+
+/* NOTE: This function may modify 'now' */
+static bool acquire_polling_lease(grpc_pollset_worker *worker,
+ polling_island *pi, gpr_timespec deadline,
+ gpr_timespec *now) {
+ bool is_lease_acquired = false;
+
+ gpr_mu_lock(&pi->worker_list_mu); // LOCK
+ long num_pollers = gpr_atm_no_barrier_load(&pi->poller_count);
+
+ if (num_pollers >= g_max_pollers_per_pi) {
+ push_back_worker_node(&pi->worker_list_head, &worker->pi_list_link);
+ gpr_mu_unlock(&pi->worker_list_mu); // UNLOCK
+
+ bool is_timeout = false;
+ int ret;
+ int timeout_ms = poll_deadline_to_millis_timeout(deadline, *now);
+ if (timeout_ms == -1) {
+ ret = sigwaitinfo(&g_wakeup_sig_set, NULL);
+ } else {
+ struct timespec sigwait_timeout = millis_to_timespec(timeout_ms);
+ GRPC_SCHEDULING_START_BLOCKING_REGION;
+ ret = sigtimedwait(&g_wakeup_sig_set, NULL, &sigwait_timeout);
+ GRPC_SCHEDULING_END_BLOCKING_REGION;
+ }
+
+ if (ret == -1) {
+ if (errno == EAGAIN) {
+ is_timeout = true;
+ } else {
+ /* NOTE: This should not happen. If we see these log messages, it means
+ we are most likely doing something incorrect in the setup * needed
+ for sigwaitinfo/sigtimedwait */
+ gpr_log(GPR_ERROR,
+ "sigtimedwait failed with retcode: %d (timeout_ms: %d)", errno,
+ timeout_ms);
+ }
+ }
+
+ /* Did the worker come out of sigtimedwait due to a thread that just
+ exited epoll and kicking it (in release_polling_lease function). */
+ bool is_polling_turn = gpr_atm_acq_load(&worker->is_polling_turn);
+
+ /* Did the worker come out of sigtimedwait due to a thread alerting it that
+ some completion event was (likely) available in the completion queue */
+ bool is_kicked = gpr_atm_no_barrier_load(&worker->is_kicked);
+
+ if (is_kicked || is_timeout) {
+ *now = deadline; /* Essentially make the epoll timeout = 0 */
+ } else if (is_polling_turn) {
+ *now = gpr_now(GPR_CLOCK_MONOTONIC); /* Reduce the epoll timeout */
+ }
+
+ gpr_mu_lock(&pi->worker_list_mu); // LOCK
+ /* The node might have already been removed from the list by the poller
+ that kicked this. However it is safe to call 'remove_worker_node' on
+ an already detached node */
+ remove_worker_node(&worker->pi_list_link);
+ /* It is important to read the num_pollers again under the lock so that we
+ * have the latest num_pollers value that doesn't change while we are doing
+ * the "(num_pollers < g_max_pollers_per_pi)" a a few lines below */
+ num_pollers = gpr_atm_no_barrier_load(&pi->poller_count);
+ }
+
+ if (num_pollers < g_max_pollers_per_pi) {
+ gpr_atm_no_barrier_fetch_add(&pi->poller_count, 1);
+ is_lease_acquired = true;
+ }
+
+ gpr_mu_unlock(&pi->worker_list_mu); // UNLOCK
+ return is_lease_acquired;
+}
+
+static void release_polling_lease(polling_island *pi, grpc_error **error) {
+ gpr_mu_lock(&pi->worker_list_mu);
+
+ gpr_atm_no_barrier_fetch_add(&pi->poller_count, -1);
+ worker_node *node = pop_front_worker_node(&pi->worker_list_head);
+ if (node != NULL) {
+ grpc_pollset_worker *next_worker = WORKER_FROM_WORKER_LIST_NODE(node);
+ append_error(error, poller_kick(next_worker), "poller kick error");
+ }
+
+ gpr_mu_unlock(&pi->worker_list_mu);
+}
+
+#define GRPC_EPOLL_MAX_EVENTS 100
+static void pollset_do_epoll_pwait(grpc_exec_ctx *exec_ctx, int epoll_fd,
+ grpc_pollset *pollset, polling_island *pi,
+ grpc_pollset_worker *worker,
+ gpr_timespec now, gpr_timespec deadline,
+ sigset_t *sig_mask, grpc_error **error) {
+ /* Only g_max_pollers_per_pi threads can be doing polling in parallel.
+ If we cannot get a lease, we cannot continue to do epoll_pwait() */
+ if (!acquire_polling_lease(worker, pi, deadline, &now)) {
+ return;
+ }
+
+ struct epoll_event ep_ev[GRPC_EPOLL_MAX_EVENTS];
+ int ep_rv;
+ char *err_msg;
+ const char *err_desc = "pollset_work_and_unlock";
+
+ /* timeout_ms is the time between 'now' and 'deadline' */
+ int timeout_ms = poll_deadline_to_millis_timeout(deadline, now);
+
+ GRPC_SCHEDULING_START_BLOCKING_REGION;
+ ep_rv =
+ epoll_pwait(epoll_fd, ep_ev, GRPC_EPOLL_MAX_EVENTS, timeout_ms, sig_mask);
+ GRPC_SCHEDULING_END_BLOCKING_REGION;
+
+ /* Give back the lease right away so that some other thread can enter */
+ release_polling_lease(pi, error);
+
+ if (ep_rv < 0) {
+ if (errno != EINTR) {
+ gpr_asprintf(&err_msg,
+ "epoll_wait() epoll fd: %d failed with error: %d (%s)",
+ epoll_fd, errno, strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ } else {
+ /* We were interrupted. Save an interation by doing a zero timeout
+ epoll_wait to see if there are any other events of interest */
+ GRPC_POLLING_TRACE("pollset_work: pollset: %p, worker: %p received kick",
+ (void *)pollset, (void *)worker);
+ ep_rv = epoll_wait(epoll_fd, ep_ev, GRPC_EPOLL_MAX_EVENTS, 0);
+ }
+ }
+
+#ifdef GRPC_TSAN
+ /* See the definition of g_poll_sync for more details */
+ gpr_atm_acq_load(&g_epoll_sync);
+#endif /* defined(GRPC_TSAN) */
+
+ for (int i = 0; i < ep_rv; ++i) {
+ void *data_ptr = ep_ev[i].data.ptr;
+ if (data_ptr == &pi->workqueue_wakeup_fd) {
+ append_error(error,
+ grpc_wakeup_fd_consume_wakeup(&pi->workqueue_wakeup_fd),
+ err_desc);
+ maybe_do_workqueue_work(exec_ctx, pi);
+ } else if (data_ptr == &polling_island_wakeup_fd) {
+ GRPC_POLLING_TRACE(
+ "pollset_work: pollset: %p, worker: %p polling island (epoll_fd: "
+ "%d) got merged",
+ (void *)pollset, (void *)worker, epoll_fd);
+ /* This means that our polling island is merged with a different
+ island. We do not have to do anything here since the subsequent call
+ to the function pollset_work_and_unlock() will pick up the correct
+ epoll_fd */
+ } else {
+ grpc_fd *fd = data_ptr;
+ int cancel = ep_ev[i].events & (EPOLLERR | EPOLLHUP);
+ int read_ev = ep_ev[i].events & (EPOLLIN | EPOLLPRI);
+ int write_ev = ep_ev[i].events & EPOLLOUT;
+ if (read_ev || cancel) {
+ fd_become_readable(exec_ctx, fd, pollset);
+ }
+ if (write_ev || cancel) {
+ fd_become_writable(exec_ctx, fd);
+ }
+ }
+ }
+}
+
+/* Note: sig_mask contains the signal mask to use *during* epoll_wait() */
+static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx,
+ grpc_pollset *pollset,
+ grpc_pollset_worker *worker,
+ gpr_timespec now, gpr_timespec deadline,
+ sigset_t *sig_mask, grpc_error **error) {
+ int epoll_fd = -1;
+ polling_island *pi = NULL;
+ GPR_TIMER_BEGIN("pollset_work_and_unlock", 0);
+
+ /* We need to get the epoll_fd to wait on. The epoll_fd is in inside the
+ latest polling island pointed by pollset->po.pi
+
+ Since epoll_fd is immutable, it is safe to read it without a lock on the
+ polling island. There is however a possibility that the polling island from
+ which we got the epoll_fd, got merged with another island in the meantime.
+ This is okay because in such a case, we will wakeup right-away from
+ epoll_pwait() (because any merge will poison the old polling island's epoll
+ set 'polling_island_wakeup_fd') and then pick up the latest polling_island
+ the next time this function - pollset_work_and_unlock()) is called */
+
+ if (pollset->po.pi == NULL) {
+ pollset->po.pi = polling_island_create(exec_ctx, NULL, error);
+ if (pollset->po.pi == NULL) {
+ GPR_TIMER_END("pollset_work_and_unlock", 0);
+ return; /* Fatal error. Cannot continue */
+ }
+
+ PI_ADD_REF(pollset->po.pi, "ps");
+ GRPC_POLLING_TRACE("pollset_work: pollset: %p created new pi: %p",
+ (void *)pollset, (void *)pollset->po.pi);
+ }
+
+ pi = polling_island_maybe_get_latest(pollset->po.pi);
+ epoll_fd = pi->epoll_fd;
+
+ /* Update the pollset->po.pi since the island being pointed by
+ pollset->po.pi maybe older than the one pointed by pi) */
+ if (pollset->po.pi != pi) {
+ /* Always do PI_ADD_REF before PI_UNREF because PI_UNREF may cause the
+ polling island to be deleted */
+ PI_ADD_REF(pi, "ps");
+ PI_UNREF(exec_ctx, pollset->po.pi, "ps");
+ pollset->po.pi = pi;
+ }
+
+ /* Add an extra ref so that the island does not get destroyed (which means
+ the epoll_fd won't be closed) while we are are doing an epoll_wait() on the
+ epoll_fd */
+ PI_ADD_REF(pi, "ps_work");
+ gpr_mu_unlock(&pollset->po.mu);
+
+ /* If we get some workqueue work to do, it might end up completing an item on
+ the completion queue, so there's no need to poll... so we skip that and
+ redo the complete loop to verify */
+ if (!maybe_do_workqueue_work(exec_ctx, pi)) {
+ g_current_thread_polling_island = pi;
+ pollset_do_epoll_pwait(exec_ctx, epoll_fd, pollset, pi, worker, now,
+ deadline, sig_mask, error);
+ g_current_thread_polling_island = NULL;
+ }
+
+ GPR_ASSERT(pi != NULL);
+
+ /* Before leaving, release the extra ref we added to the polling island. It
+ is important to use "pi" here (i.e our old copy of pollset->po.pi
+ that we got before releasing the polling island lock). This is because
+ pollset->po.pi pointer might get udpated in other parts of the
+ code when there is an island merge while we are doing epoll_wait() above */
+ PI_UNREF(exec_ctx, pi, "ps_work");
+
+ GPR_TIMER_END("pollset_work_and_unlock", 0);
+}
+
+/* pollset->po.mu lock must be held by the caller before calling this.
+ The function pollset_work() may temporarily release the lock (pollset->po.mu)
+ during the course of its execution but it will always re-acquire the lock and
+ ensure that it is held by the time the function returns */
+static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_pollset_worker **worker_hdl,
+ gpr_timespec now, gpr_timespec deadline) {
+ GPR_TIMER_BEGIN("pollset_work", 0);
+ grpc_error *error = GRPC_ERROR_NONE;
+
+ grpc_pollset_worker worker;
+ pollset_worker_init(&worker);
+
+ if (worker_hdl) *worker_hdl = &worker;
+
+ gpr_tls_set(&g_current_thread_pollset, (intptr_t)pollset);
+ gpr_tls_set(&g_current_thread_worker, (intptr_t)&worker);
+
+ if (pollset->kicked_without_pollers) {
+ /* If the pollset was kicked without pollers, pretend that the current
+ worker got the kick and skip polling. A kick indicates that there is some
+ work that needs attention like an event on the completion queue or an
+ alarm */
+ GPR_TIMER_MARK("pollset_work.kicked_without_pollers", 0);
+ pollset->kicked_without_pollers = 0;
+ } else if (!pollset->shutting_down) {
+ /* We use the posix-signal with number 'grpc_wakeup_signal' for waking up
+ (i.e 'kicking') a worker in the pollset. A 'kick' is a way to inform the
+ worker that there is some pending work that needs immediate attention
+ (like an event on the completion queue, or a polling island merge that
+ results in a new epoll-fd to wait on) and that the worker should not
+ spend time waiting in epoll_pwait().
+
+ A worker can be kicked anytime from the point it is added to the pollset
+ via push_front_worker() (or push_back_worker()) to the point it is
+ removed via remove_worker().
+ If the worker is kicked before/during it calls epoll_pwait(), it should
+ immediately exit from epoll_wait(). If the worker is kicked after it
+ returns from epoll_wait(), then nothing really needs to be done.
+
+ To accomplish this, we mask 'grpc_wakeup_signal' on this thread at all
+ times *except* when it is in epoll_pwait(). This way, the worker never
+ misses acting on a kick */
+
+ if (!g_initialized_sigmask) {
+ sigemptyset(&g_wakeup_sig_set);
+ sigaddset(&g_wakeup_sig_set, grpc_wakeup_signal);
+ pthread_sigmask(SIG_BLOCK, &g_wakeup_sig_set, &g_orig_sigmask);
+ sigdelset(&g_orig_sigmask, grpc_wakeup_signal);
+ g_initialized_sigmask = true;
+ /* new_mask: The new thread mask which blocks 'grpc_wakeup_signal'.
+ This is the mask used at all times *except during
+ epoll_wait()*"
+ g_orig_sigmask: The thread mask which allows 'grpc_wakeup_signal' and
+ this is the mask to use *during epoll_wait()*
+
+ The new_mask is set on the worker before it is added to the pollset
+ (i.e before it can be kicked) */
+ }
+
+ push_front_worker(pollset, &worker); /* Add worker to pollset */
+
+ pollset_work_and_unlock(exec_ctx, pollset, &worker, now, deadline,
+ &g_orig_sigmask, &error);
+ grpc_exec_ctx_flush(exec_ctx);
+
+ gpr_mu_lock(&pollset->po.mu);
+
+ /* Note: There is no need to reset worker.is_kicked to 0 since we are no
+ longer going to use this worker */
+ remove_worker(pollset, &worker);
+ }
+
+ /* If we are the last worker on the pollset (i.e pollset_has_workers() is
+ false at this point) and the pollset is shutting down, we may have to
+ finish the shutdown process by calling finish_shutdown_locked().
+ See pollset_shutdown() for more details.
+
+ Note: Continuing to access pollset here is safe; it is the caller's
+ responsibility to not destroy a pollset when it has outstanding calls to
+ pollset_work() */
+ if (pollset->shutting_down && !pollset_has_workers(pollset) &&
+ !pollset->finish_shutdown_called) {
+ GPR_TIMER_MARK("pollset_work.finish_shutdown_locked", 0);
+ finish_shutdown_locked(exec_ctx, pollset);
+
+ gpr_mu_unlock(&pollset->po.mu);
+ grpc_exec_ctx_flush(exec_ctx);
+ gpr_mu_lock(&pollset->po.mu);
+ }
+
+ if (worker_hdl) *worker_hdl = NULL;
+
+ gpr_tls_set(&g_current_thread_pollset, (intptr_t)0);
+ gpr_tls_set(&g_current_thread_worker, (intptr_t)0);
+
+ GPR_TIMER_END("pollset_work", 0);
+
+ GRPC_LOG_IF_ERROR("pollset_work", GRPC_ERROR_REF(error));
+ return error;
+}
+
+static void add_poll_object(grpc_exec_ctx *exec_ctx, poll_obj *bag,
+ poll_obj_type bag_type, poll_obj *item,
+ poll_obj_type item_type) {
+ GPR_TIMER_BEGIN("add_poll_object", 0);
+
+#ifdef PO_DEBUG
+ GPR_ASSERT(item->obj_type == item_type);
+ GPR_ASSERT(bag->obj_type == bag_type);
+#endif
+
+ grpc_error *error = GRPC_ERROR_NONE;
+ polling_island *pi_new = NULL;
+
+ gpr_mu_lock(&bag->mu);
+ gpr_mu_lock(&item->mu);
+
+retry:
+ /*
+ * 1) If item->pi and bag->pi are both non-NULL and equal, do nothing
+ * 2) If item->pi and bag->pi are both NULL, create a new polling island (with
+ * a refcount of 2) and point item->pi and bag->pi to the new island
+ * 3) If exactly one of item->pi or bag->pi is NULL, update it to point to
+ * the other's non-NULL pi
+ * 4) Finally if item->pi and bag-pi are non-NULL and not-equal, merge the
+ * polling islands and update item->pi and bag->pi to point to the new
+ * island
+ */
+
+ /* Early out if we are trying to add an 'fd' to a 'bag' but the fd is already
+ * orphaned */
+ if (item_type == POLL_OBJ_FD && (FD_FROM_PO(item))->orphaned) {
+ gpr_mu_unlock(&item->mu);
+ gpr_mu_unlock(&bag->mu);
+ return;
+ }
+
+ if (item->pi == bag->pi) {
+ pi_new = item->pi;
+ if (pi_new == NULL) {
+ /* GPR_ASSERT(item->pi == bag->pi == NULL) */
+
+ /* If we are adding an fd to a bag (i.e pollset or pollset_set), then
+ * we need to do some extra work to make TSAN happy */
+ if (item_type == POLL_OBJ_FD) {
+ /* Unlock before creating a new polling island: the polling island will
+ create a workqueue which creates a file descriptor, and holding an fd
+ lock here can eventually cause a loop to appear to TSAN (making it
+ unhappy). We don't think it's a real loop (there's an epoch point
+ where that loop possibility disappears), but the advantages of
+ keeping TSAN happy outweigh any performance advantage we might have
+ by keeping the lock held. */
+ gpr_mu_unlock(&item->mu);
+ pi_new = polling_island_create(exec_ctx, FD_FROM_PO(item), &error);
+ gpr_mu_lock(&item->mu);
+
+ /* Need to reverify any assumptions made between the initial lock and
+ getting to this branch: if they've changed, we need to throw away our
+ work and figure things out again. */
+ if (item->pi != NULL) {
+ GRPC_POLLING_TRACE(
+ "add_poll_object: Raced creating new polling island. pi_new: %p "
+ "(fd: %d, %s: %p)",
+ (void *)pi_new, FD_FROM_PO(item)->fd, poll_obj_string(bag_type),
+ (void *)bag);
+ /* No need to lock 'pi_new' here since this is a new polling island
+ and no one has a reference to it yet */
+ polling_island_remove_all_fds_locked(pi_new, true, &error);
+
+ /* Ref and unref so that the polling island gets deleted during unref
+ */
+ PI_ADD_REF(pi_new, "dance_of_destruction");
+ PI_UNREF(exec_ctx, pi_new, "dance_of_destruction");
+ goto retry;
+ }
+ } else {
+ pi_new = polling_island_create(exec_ctx, NULL, &error);
+ }
+
+ GRPC_POLLING_TRACE(
+ "add_poll_object: Created new polling island. pi_new: %p (%s: %p, "
+ "%s: %p)",
+ (void *)pi_new, poll_obj_string(item_type), (void *)item,
+ poll_obj_string(bag_type), (void *)bag);
+ } else {
+ GRPC_POLLING_TRACE(
+ "add_poll_object: Same polling island. pi: %p (%s, %s)",
+ (void *)pi_new, poll_obj_string(item_type),
+ poll_obj_string(bag_type));
+ }
+ } else if (item->pi == NULL) {
+ /* GPR_ASSERT(bag->pi != NULL) */
+ /* Make pi_new point to latest pi*/
+ pi_new = polling_island_lock(bag->pi);
+
+ if (item_type == POLL_OBJ_FD) {
+ grpc_fd *fd = FD_FROM_PO(item);
+ polling_island_add_fds_locked(pi_new, &fd, 1, true, &error);
+ }
+
+ gpr_mu_unlock(&pi_new->mu);
+ GRPC_POLLING_TRACE(
+ "add_poll_obj: item->pi was NULL. pi_new: %p (item(%s): %p, "
+ "bag(%s): %p)",
+ (void *)pi_new, poll_obj_string(item_type), (void *)item,
+ poll_obj_string(bag_type), (void *)bag);
+ } else if (bag->pi == NULL) {
+ /* GPR_ASSERT(item->pi != NULL) */
+ /* Make pi_new to point to latest pi */
+ pi_new = polling_island_lock(item->pi);
+ gpr_mu_unlock(&pi_new->mu);
+ GRPC_POLLING_TRACE(
+ "add_poll_obj: bag->pi was NULL. pi_new: %p (item(%s): %p, "
+ "bag(%s): %p)",
+ (void *)pi_new, poll_obj_string(item_type), (void *)item,
+ poll_obj_string(bag_type), (void *)bag);
+ } else {
+ pi_new = polling_island_merge(item->pi, bag->pi, &error);
+ GRPC_POLLING_TRACE(
+ "add_poll_obj: polling islands merged. pi_new: %p (item(%s): %p, "
+ "bag(%s): %p)",
+ (void *)pi_new, poll_obj_string(item_type), (void *)item,
+ poll_obj_string(bag_type), (void *)bag);
+ }
+
+ /* At this point, pi_new is the polling island that both item->pi and bag->pi
+ MUST be pointing to */
+
+ if (item->pi != pi_new) {
+ PI_ADD_REF(pi_new, poll_obj_string(item_type));
+ if (item->pi != NULL) {
+ PI_UNREF(exec_ctx, item->pi, poll_obj_string(item_type));
+ }
+ item->pi = pi_new;
+ }
+
+ if (bag->pi != pi_new) {
+ PI_ADD_REF(pi_new, poll_obj_string(bag_type));
+ if (bag->pi != NULL) {
+ PI_UNREF(exec_ctx, bag->pi, poll_obj_string(bag_type));
+ }
+ bag->pi = pi_new;
+ }
+
+ gpr_mu_unlock(&item->mu);
+ gpr_mu_unlock(&bag->mu);
+
+ GRPC_LOG_IF_ERROR("add_poll_object", error);
+ GPR_TIMER_END("add_poll_object", 0);
+}
+
+static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_fd *fd) {
+ add_poll_object(exec_ctx, &pollset->po, POLL_OBJ_POLLSET, &fd->po,
+ POLL_OBJ_FD);
+}
+
+/*******************************************************************************
+ * Pollset-set Definitions
+ */
+
+static grpc_pollset_set *pollset_set_create(void) {
+ grpc_pollset_set *pss = gpr_malloc(sizeof(*pss));
+ gpr_mu_init(&pss->po.mu);
+ pss->po.pi = NULL;
+#ifdef PO_DEBUG
+ pss->po.obj_type = POLL_OBJ_POLLSET_SET;
+#endif
+ return pss;
+}
+
+static void pollset_set_destroy(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss) {
+ gpr_mu_destroy(&pss->po.mu);
+
+ if (pss->po.pi != NULL) {
+ PI_UNREF(exec_ctx, pss->po.pi, "pss_destroy");
+ }
+
+ gpr_free(pss);
+}
+
+static void pollset_set_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {
+ add_poll_object(exec_ctx, &pss->po, POLL_OBJ_POLLSET_SET, &fd->po,
+ POLL_OBJ_FD);
+}
+
+static void pollset_set_del_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {
+ /* Nothing to do */
+}
+
+static void pollset_set_add_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {
+ add_poll_object(exec_ctx, &pss->po, POLL_OBJ_POLLSET_SET, &ps->po,
+ POLL_OBJ_POLLSET);
+}
+
+static void pollset_set_del_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {
+ /* Nothing to do */
+}
+
+static void pollset_set_add_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {
+ add_poll_object(exec_ctx, &bag->po, POLL_OBJ_POLLSET_SET, &item->po,
+ POLL_OBJ_POLLSET_SET);
+}
+
+static void pollset_set_del_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {
+ /* Nothing to do */
+}
+
+/*******************************************************************************
+ * Event engine binding
+ */
+
+static void shutdown_engine(void) {
+ fd_global_shutdown();
+ pollset_global_shutdown();
+ polling_island_global_shutdown();
+}
+
+static const grpc_event_engine_vtable vtable = {
+ .pollset_size = sizeof(grpc_pollset),
+
+ .fd_create = fd_create,
+ .fd_wrapped_fd = fd_wrapped_fd,
+ .fd_orphan = fd_orphan,
+ .fd_shutdown = fd_shutdown,
+ .fd_is_shutdown = fd_is_shutdown,
+ .fd_notify_on_read = fd_notify_on_read,
+ .fd_notify_on_write = fd_notify_on_write,
+ .fd_get_read_notifier_pollset = fd_get_read_notifier_pollset,
+ .fd_get_workqueue = fd_get_workqueue,
+
+ .pollset_init = pollset_init,
+ .pollset_shutdown = pollset_shutdown,
+ .pollset_destroy = pollset_destroy,
+ .pollset_work = pollset_work,
+ .pollset_kick = pollset_kick,
+ .pollset_add_fd = pollset_add_fd,
+
+ .pollset_set_create = pollset_set_create,
+ .pollset_set_destroy = pollset_set_destroy,
+ .pollset_set_add_pollset = pollset_set_add_pollset,
+ .pollset_set_del_pollset = pollset_set_del_pollset,
+ .pollset_set_add_pollset_set = pollset_set_add_pollset_set,
+ .pollset_set_del_pollset_set = pollset_set_del_pollset_set,
+ .pollset_set_add_fd = pollset_set_add_fd,
+ .pollset_set_del_fd = pollset_set_del_fd,
+
+ .workqueue_ref = workqueue_ref,
+ .workqueue_unref = workqueue_unref,
+ .workqueue_scheduler = workqueue_scheduler,
+
+ .shutdown_engine = shutdown_engine,
+};
+
+/* It is possible that GLIBC has epoll but the underlying kernel doesn't.
+ * Create a dummy epoll_fd to make sure epoll support is available */
+static bool is_epoll_available() {
+ int fd = epoll_create1(EPOLL_CLOEXEC);
+ if (fd < 0) {
+ gpr_log(
+ GPR_ERROR,
+ "epoll_create1 failed with error: %d. Not using epoll polling engine",
+ fd);
+ return false;
+ }
+ close(fd);
+ return true;
+}
+
+/* This is mainly for testing purposes. Checks to see if environment variable
+ * GRPC_MAX_POLLERS_PER_PI is set and if so, assigns that value to
+ * g_max_pollers_per_pi (any negative value is considered INT_MAX) */
+static void set_max_pollers_per_island() {
+ char *s = gpr_getenv("GRPC_MAX_POLLERS_PER_PI");
+ if (s) {
+ g_max_pollers_per_pi = (int)strtol(s, NULL, 10);
+ if (g_max_pollers_per_pi < 0) {
+ g_max_pollers_per_pi = INT_MAX;
+ }
+ } else {
+ g_max_pollers_per_pi = INT_MAX;
+ }
+
+ gpr_log(GPR_INFO, "Max number of pollers per polling island: %d",
+ g_max_pollers_per_pi);
+}
+
+const grpc_event_engine_vtable *grpc_init_epoll_limited_pollers_linux(
+ bool explicitly_requested) {
+ if (!explicitly_requested) {
+ return NULL;
+ }
+
+ /* If use of signals is disabled, we cannot use epoll engine*/
+ if (is_grpc_wakeup_signal_initialized && grpc_wakeup_signal < 0) {
+ return NULL;
+ }
+
+ if (!grpc_has_wakeup_fd()) {
+ return NULL;
+ }
+
+ if (!is_epoll_available()) {
+ return NULL;
+ }
+
+ if (!is_grpc_wakeup_signal_initialized) {
+ grpc_use_signal(SIGRTMIN + 6);
+ }
+
+ set_max_pollers_per_island();
+
+ fd_global_init();
+
+ if (!GRPC_LOG_IF_ERROR("pollset_global_init", pollset_global_init())) {
+ return NULL;
+ }
+
+ if (!GRPC_LOG_IF_ERROR("polling_island_global_init",
+ polling_island_global_init())) {
+ return NULL;
+ }
+
+ return &vtable;
+}
+
+#else /* defined(GRPC_LINUX_EPOLL) */
+#if defined(GRPC_POSIX_SOCKET)
+#include "src/core/lib/iomgr/ev_posix.h"
+/* If GRPC_LINUX_EPOLL is not defined, it means epoll is not available. Return
+ * NULL */
+const grpc_event_engine_vtable *grpc_init_epoll_limited_pollers_linux(
+ bool explicitly_requested) {
+ return NULL;
+}
+#endif /* defined(GRPC_POSIX_SOCKET) */
+#endif /* !defined(GRPC_LINUX_EPOLL) */
diff --git a/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.h b/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.h
new file mode 100644
index 0000000000..379e1ded3b
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.h
@@ -0,0 +1,43 @@
+/*
+ *
+ * Copyright 2015, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_IOMGR_EV_EPOLL_LIMITED_POLLERS_LINUX_H
+#define GRPC_CORE_LIB_IOMGR_EV_EPOLL_LIMITED_POLLERS_LINUX_H
+
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/port.h"
+
+const grpc_event_engine_vtable *grpc_init_epoll_limited_pollers_linux(
+ bool explicitly_requested);
+
+#endif /* GRPC_CORE_LIB_IOMGR_EV_EPOLL_LIMITED_POLLERS_LINUX_H */
diff --git a/src/core/lib/iomgr/ev_epoll_thread_pool_linux.c b/src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
new file mode 100644
index 0000000000..bb44321922
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
@@ -0,0 +1,1337 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/lib/iomgr/port.h"
+
+/* This polling engine is only relevant on linux kernels supporting epoll() */
+#ifdef GRPC_LINUX_EPOLL
+
+#include "src/core/lib/iomgr/ev_epoll_thread_pool_linux.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <grpc/support/alloc.h>
+#include <grpc/support/cpu.h>
+#include <grpc/support/log.h>
+#include <grpc/support/string_util.h>
+#include <grpc/support/thd.h>
+#include <grpc/support/tls.h>
+#include <grpc/support/useful.h>
+
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/iomgr_internal.h"
+#include "src/core/lib/iomgr/lockfree_event.h"
+#include "src/core/lib/iomgr/timer.h"
+#include "src/core/lib/iomgr/wakeup_fd_posix.h"
+#include "src/core/lib/iomgr/workqueue.h"
+#include "src/core/lib/profiling/timers.h"
+#include "src/core/lib/support/block_annotate.h"
+
+/* TODO: sreek - Move this to init.c and initialize this like other tracers. */
+#define GRPC_POLLING_TRACE(fmt, ...) \
+ if (GRPC_TRACER_ON(grpc_polling_trace)) { \
+ gpr_log(GPR_INFO, (fmt), __VA_ARGS__); \
+ }
+
+/* The alarm system needs to be able to wakeup 'some poller' sometimes
+ * (specifically when a new alarm needs to be triggered earlier than the next
+ * alarm 'epoch'). This wakeup_fd gives us something to alert on when such a
+ * case occurs. */
+
+struct epoll_set;
+
+#define GRPC_POLLSET_KICK_BROADCAST ((grpc_pollset_worker *)1)
+
+/*******************************************************************************
+ * Fd Declarations
+ */
+struct grpc_fd {
+ gpr_mu mu;
+ struct epoll_set *eps;
+
+ int fd;
+
+ /* The fd is either closed or we relinquished control of it. In either cases,
+ this indicates that the 'fd' on this structure is no longer valid */
+ bool orphaned;
+
+ gpr_atm read_closure;
+ gpr_atm write_closure;
+
+ struct grpc_fd *freelist_next;
+ grpc_closure *on_done_closure;
+
+ grpc_iomgr_object iomgr_object;
+};
+
+static void fd_global_init(void);
+static void fd_global_shutdown(void);
+
+/*******************************************************************************
+ * epoll set Declarations
+ */
+
+#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
+
+#define EPS_ADD_REF(p, r) eps_add_ref_dbg((p), (r), __FILE__, __LINE__)
+#define EPS_UNREF(exec_ctx, p, r) \
+ eps_unref_dbg((exec_ctx), (p), (r), __FILE__, __LINE__)
+
+#else /* defined(GRPC_WORKQUEUE_REFCOUNT_DEBUG) */
+
+#define EPS_ADD_REF(p, r) eps_add_ref((p))
+#define EPS_UNREF(exec_ctx, p, r) eps_unref((exec_ctx), (p))
+
+#endif /* !defined(GRPC_EPS_REF_COUNT_DEBUG) */
+
+/* This is also used as grpc_workqueue (by directly casting it) */
+typedef struct epoll_set {
+ grpc_closure_scheduler workqueue_scheduler;
+
+ /* Mutex poller should acquire to poll this. This enforces that only one
+ * poller can be polling on epoll_set at any time */
+ gpr_mu mu;
+
+ /* Ref count. Use EPS_ADD_REF() and EPS_UNREF() macros to increment/decrement
+ the refcount. Once the ref count becomes zero, this structure is destroyed
+ which means we should ensure that there is never a scenario where a
+ EPS_ADD_REF() is racing with a EPS_UNREF() that just made the ref_count
+ zero. */
+ gpr_atm ref_count;
+
+ /* Number of threads currently polling on this epoll set*/
+ gpr_atm poller_count;
+ /* Mutex guarding the read end of the workqueue (must be held to pop from
+ * workqueue_items) */
+ gpr_mu workqueue_read_mu;
+ /* Queue of closures to be executed */
+ gpr_mpscq workqueue_items;
+ /* Count of items in workqueue_items */
+ gpr_atm workqueue_item_count;
+ /* Wakeup fd used to wake pollers to check the contents of workqueue_items */
+ grpc_wakeup_fd workqueue_wakeup_fd;
+
+ /* Is the epoll set shutdown */
+ gpr_atm is_shutdown;
+
+ /* The fd of the underlying epoll set */
+ int epoll_fd;
+} epoll_set;
+
+/*******************************************************************************
+ * Pollset Declarations
+ */
+struct grpc_pollset_worker {
+ gpr_cv kick_cv;
+
+ struct grpc_pollset_worker *next;
+ struct grpc_pollset_worker *prev;
+};
+
+struct grpc_pollset {
+ gpr_mu mu;
+ struct epoll_set *eps;
+
+ grpc_pollset_worker root_worker;
+ bool kicked_without_pollers;
+
+ bool shutting_down; /* Is the pollset shutting down ? */
+ bool finish_shutdown_called; /* Is the 'finish_shutdown_locked()' called ? */
+ grpc_closure *shutdown_done; /* Called after after shutdown is complete */
+};
+
+/*******************************************************************************
+ * Pollset-set Declarations
+ */
+struct grpc_pollset_set {};
+
+/*****************************************************************************
+ * Dedicated polling threads and pollsets - Declarations
+ */
+
+size_t g_num_eps = 1;
+struct epoll_set **g_epoll_sets = NULL;
+gpr_atm g_next_eps;
+size_t g_num_threads_per_eps = 1;
+gpr_thd_id *g_poller_threads = NULL;
+
+/* Used as read-notifier pollsets for fds. We won't be using read notifier
+ * pollsets with this polling engine. So it does not matter what pollset we
+ * return */
+grpc_pollset g_read_notifier;
+
+static void add_fd_to_eps(grpc_fd *fd);
+static bool init_epoll_sets();
+static void shutdown_epoll_sets();
+static void poller_thread_loop(void *arg);
+static void start_poller_threads();
+static void shutdown_poller_threads();
+
+/*******************************************************************************
+ * Common helpers
+ */
+
+static bool append_error(grpc_error **composite, grpc_error *error,
+ const char *desc) {
+ if (error == GRPC_ERROR_NONE) return true;
+ if (*composite == GRPC_ERROR_NONE) {
+ *composite = GRPC_ERROR_CREATE_FROM_COPIED_STRING(desc);
+ }
+ *composite = grpc_error_add_child(*composite, error);
+ return false;
+}
+
+/*******************************************************************************
+ * epoll set Definitions
+ */
+
+/* The wakeup fd that is used to wake up all threads in an epoll_set informing
+ that the epoll set is shutdown. This wakeup fd initialized to be readable
+ and MUST NOT be consumed i.e the threads that woke up MUST NOT call
+ grpc_wakeup_fd_consume_wakeup() */
+static grpc_wakeup_fd epoll_set_wakeup_fd;
+
+/* The epoll set being polled right now.
+ See comments in workqueue_maybe_wakeup for why this is tracked. */
+static __thread epoll_set *g_current_thread_epoll_set;
+
+/* Forward declaration */
+static void epoll_set_delete(epoll_set *eps);
+static void workqueue_enqueue(grpc_exec_ctx *exec_ctx, grpc_closure *closure,
+ grpc_error *error);
+
+#ifdef GRPC_TSAN
+/* Currently TSAN may incorrectly flag data races between epoll_ctl and
+ epoll_wait for any grpc_fd structs that are added to the epoll set via
+ epoll_ctl and are returned (within a very short window) via epoll_wait().
+
+ To work-around this race, we establish a happens-before relation between
+ the code just-before epoll_ctl() and the code after epoll_wait() by using
+ this atomic */
+gpr_atm g_epoll_sync;
+#endif /* defined(GRPC_TSAN) */
+
+static const grpc_closure_scheduler_vtable workqueue_scheduler_vtable = {
+ workqueue_enqueue, workqueue_enqueue, "workqueue"};
+
+static void eps_add_ref(epoll_set *eps);
+static void eps_unref(grpc_exec_ctx *exec_ctx, epoll_set *eps);
+
+#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
+static void eps_add_ref_dbg(epoll_set *eps, const char *reason,
+ const char *file, int line) {
+ long old_cnt = gpr_atm_acq_load(&eps->ref_count);
+ eps_add_ref(eps);
+ gpr_log(GPR_DEBUG, "Add ref eps: %p, old: %ld -> new:%ld (%s) - (%s, %d)",
+ (void *)eps, old_cnt, old_cnt + 1, reason, file, line);
+}
+
+static void eps_unref_dbg(grpc_exec_ctx *exec_ctx, epoll_set *eps,
+ const char *reason, const char *file, int line) {
+ long old_cnt = gpr_atm_acq_load(&eps->ref_count);
+ eps_unref(exec_ctx, eps);
+ gpr_log(GPR_DEBUG, "Unref eps: %p, old:%ld -> new:%ld (%s) - (%s, %d)",
+ (void *)eps, old_cnt, (old_cnt - 1), reason, file, line);
+}
+
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue,
+ const char *file, int line,
+ const char *reason) {
+ if (workqueue != NULL) {
+ eps_add_ref_dbg((epoll_set *)workqueue, reason, file, line);
+ }
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx, grpc_workqueue *workqueue,
+ const char *file, int line, const char *reason) {
+ if (workqueue != NULL) {
+ eps_unref_dbg(exec_ctx, (epoll_set *)workqueue, reason, file, line);
+ }
+}
+#else
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue) {
+ if (workqueue != NULL) {
+ eps_add_ref((epoll_set *)workqueue);
+ }
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx,
+ grpc_workqueue *workqueue) {
+ if (workqueue != NULL) {
+ eps_unref(exec_ctx, (epoll_set *)workqueue);
+ }
+}
+#endif
+
+static void eps_add_ref(epoll_set *eps) {
+ gpr_atm_no_barrier_fetch_add(&eps->ref_count, 1);
+}
+
+static void eps_unref(grpc_exec_ctx *exec_ctx, epoll_set *eps) {
+ /* If ref count went to zero, delete the epoll set. This deletion is
+ not done under a lock since once the ref count goes to zero, we are
+ guaranteed that no one else holds a reference to the epoll set (and
+ that there is no racing eps_add_ref() call either).*/
+ if (1 == gpr_atm_full_fetch_add(&eps->ref_count, -1)) {
+ epoll_set_delete(eps);
+ }
+}
+
+static void epoll_set_add_fd_locked(epoll_set *eps, grpc_fd *fd,
+ grpc_error **error) {
+ int err;
+ struct epoll_event ev;
+ char *err_msg;
+ const char *err_desc = "epoll_set_add_fd_locked";
+
+#ifdef GRPC_TSAN
+ /* See the definition of g_epoll_sync for more context */
+ gpr_atm_rel_store(&g_epoll_sync, (gpr_atm)0);
+#endif /* defined(GRPC_TSAN) */
+
+ ev.events = (uint32_t)(EPOLLIN | EPOLLOUT | EPOLLET);
+ ev.data.ptr = fd;
+ err = epoll_ctl(eps->epoll_fd, EPOLL_CTL_ADD, fd->fd, &ev);
+ if (err < 0 && errno != EEXIST) {
+ gpr_asprintf(
+ &err_msg,
+ "epoll_ctl (epoll_fd: %d) add fd: %d failed with error: %d (%s)",
+ eps->epoll_fd, fd->fd, errno, strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ gpr_free(err_msg);
+ }
+}
+
+static void epoll_set_add_wakeup_fd_locked(epoll_set *eps,
+ grpc_wakeup_fd *wakeup_fd,
+ grpc_error **error) {
+ struct epoll_event ev;
+ int err;
+ char *err_msg;
+ const char *err_desc = "epoll_set_add_wakeup_fd";
+
+ ev.events = (uint32_t)(EPOLLIN | EPOLLET);
+ ev.data.ptr = wakeup_fd;
+ err = epoll_ctl(eps->epoll_fd, EPOLL_CTL_ADD,
+ GRPC_WAKEUP_FD_GET_READ_FD(wakeup_fd), &ev);
+ if (err < 0 && errno != EEXIST) {
+ gpr_asprintf(&err_msg,
+ "epoll_ctl (epoll_fd: %d) add wakeup fd: %d failed with "
+ "error: %d (%s)",
+ eps->epoll_fd, GRPC_WAKEUP_FD_GET_READ_FD(wakeup_fd), errno,
+ strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ gpr_free(err_msg);
+ }
+}
+
+static void epoll_set_remove_fd(epoll_set *eps, grpc_fd *fd, bool is_fd_closed,
+ grpc_error **error) {
+ int err;
+ char *err_msg;
+ const char *err_desc = "epoll_set_remove_fd";
+
+ /* If fd is already closed, then it would have been automatically been removed
+ from the epoll set */
+ if (!is_fd_closed) {
+ err = epoll_ctl(eps->epoll_fd, EPOLL_CTL_DEL, fd->fd, NULL);
+ if (err < 0 && errno != ENOENT) {
+ gpr_asprintf(
+ &err_msg,
+ "epoll_ctl (epoll_fd: %d) del fd: %d failed with error: %d (%s)",
+ eps->epoll_fd, fd->fd, errno, strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ gpr_free(err_msg);
+ }
+ }
+}
+
+/* Might return NULL in case of an error */
+static epoll_set *epoll_set_create(grpc_error **error) {
+ epoll_set *eps = NULL;
+ const char *err_desc = "epoll_set_create";
+
+ *error = GRPC_ERROR_NONE;
+
+ eps = gpr_malloc(sizeof(*eps));
+ eps->workqueue_scheduler.vtable = &workqueue_scheduler_vtable;
+ eps->epoll_fd = -1;
+
+ gpr_mu_init(&eps->mu);
+ gpr_mu_init(&eps->workqueue_read_mu);
+ gpr_mpscq_init(&eps->workqueue_items);
+ gpr_atm_rel_store(&eps->workqueue_item_count, 0);
+
+ gpr_atm_rel_store(&eps->ref_count, 0);
+ gpr_atm_rel_store(&eps->poller_count, 0);
+
+ gpr_atm_rel_store(&eps->is_shutdown, false);
+
+ if (!append_error(error, grpc_wakeup_fd_init(&eps->workqueue_wakeup_fd),
+ err_desc)) {
+ goto done;
+ }
+
+ eps->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+
+ if (eps->epoll_fd < 0) {
+ append_error(error, GRPC_OS_ERROR(errno, "epoll_create1"), err_desc);
+ goto done;
+ }
+
+ epoll_set_add_wakeup_fd_locked(eps, &eps->workqueue_wakeup_fd, error);
+
+done:
+ if (*error != GRPC_ERROR_NONE) {
+ epoll_set_delete(eps);
+ eps = NULL;
+ }
+ return eps;
+}
+
+static void epoll_set_delete(epoll_set *eps) {
+ if (eps->epoll_fd >= 0) {
+ close(eps->epoll_fd);
+ }
+
+ GPR_ASSERT(gpr_atm_no_barrier_load(&eps->workqueue_item_count) == 0);
+ gpr_mu_destroy(&eps->mu);
+ gpr_mu_destroy(&eps->workqueue_read_mu);
+ gpr_mpscq_destroy(&eps->workqueue_items);
+ grpc_wakeup_fd_destroy(&eps->workqueue_wakeup_fd);
+
+ gpr_free(eps);
+}
+
+static void workqueue_maybe_wakeup(epoll_set *eps) {
+ /* If this thread is the current poller, then it may be that it's about to
+ decrement the current poller count, so we need to look past this thread */
+ bool is_current_poller = (g_current_thread_epoll_set == eps);
+ gpr_atm min_current_pollers_for_wakeup = is_current_poller ? 1 : 0;
+ gpr_atm current_pollers = gpr_atm_no_barrier_load(&eps->poller_count);
+ /* Only issue a wakeup if it's likely that some poller could come in and take
+ it right now. Note that since we do an anticipatory mpscq_pop every poll
+ loop, it's ok if we miss the wakeup here, as we'll get the work item when
+ the next poller enters anyway. */
+ if (current_pollers > min_current_pollers_for_wakeup) {
+ GRPC_LOG_IF_ERROR("workqueue_wakeup_fd",
+ grpc_wakeup_fd_wakeup(&eps->workqueue_wakeup_fd));
+ }
+}
+
+static void workqueue_enqueue(grpc_exec_ctx *exec_ctx, grpc_closure *closure,
+ grpc_error *error) {
+ GPR_TIMER_BEGIN("workqueue.enqueue", 0);
+ grpc_workqueue *workqueue = (grpc_workqueue *)closure->scheduler;
+ /* take a ref to the workqueue: otherwise it can happen that whatever events
+ * this kicks off ends up destroying the workqueue before this function
+ * completes */
+ GRPC_WORKQUEUE_REF(workqueue, "enqueue");
+ epoll_set *eps = (epoll_set *)workqueue;
+ gpr_atm last = gpr_atm_no_barrier_fetch_add(&eps->workqueue_item_count, 1);
+ closure->error_data.error = error;
+ gpr_mpscq_push(&eps->workqueue_items, &closure->next_data.atm_next);
+ if (last == 0) {
+ workqueue_maybe_wakeup(eps);
+ }
+
+ GRPC_WORKQUEUE_UNREF(exec_ctx, workqueue, "enqueue");
+ GPR_TIMER_END("workqueue.enqueue", 0);
+}
+
+static grpc_closure_scheduler *workqueue_scheduler(grpc_workqueue *workqueue) {
+ epoll_set *eps = (epoll_set *)workqueue;
+ return workqueue == NULL ? grpc_schedule_on_exec_ctx
+ : &eps->workqueue_scheduler;
+}
+
+static grpc_error *epoll_set_global_init() {
+ grpc_error *error = GRPC_ERROR_NONE;
+
+ error = grpc_wakeup_fd_init(&epoll_set_wakeup_fd);
+ if (error == GRPC_ERROR_NONE) {
+ error = grpc_wakeup_fd_wakeup(&epoll_set_wakeup_fd);
+ }
+
+ return error;
+}
+
+static void epoll_set_global_shutdown() {
+ grpc_wakeup_fd_destroy(&epoll_set_wakeup_fd);
+}
+
+/*******************************************************************************
+ * Fd Definitions
+ */
+
+/* We need to keep a freelist not because of any concerns of malloc performance
+ * but instead so that implementations with multiple threads in (for example)
+ * epoll_wait deal with the race between pollset removal and incoming poll
+ * notifications.
+ *
+ * The problem is that the poller ultimately holds a reference to this
+ * object, so it is very difficult to know when is safe to free it, at least
+ * without some expensive synchronization.
+ *
+ * If we keep the object freelisted, in the worst case losing this race just
+ * becomes a spurious read notification on a reused fd.
+ */
+
+static grpc_fd *fd_freelist = NULL;
+static gpr_mu fd_freelist_mu;
+
+static grpc_fd *get_fd_from_freelist() {
+ grpc_fd *new_fd = NULL;
+
+ gpr_mu_lock(&fd_freelist_mu);
+ if (fd_freelist != NULL) {
+ new_fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ }
+ gpr_mu_unlock(&fd_freelist_mu);
+ return new_fd;
+}
+
+static void add_fd_to_freelist(grpc_fd *fd) {
+ gpr_mu_lock(&fd_freelist_mu);
+ fd->freelist_next = fd_freelist;
+ fd_freelist = fd;
+ grpc_iomgr_unregister_object(&fd->iomgr_object);
+
+ grpc_lfev_destroy(&fd->read_closure);
+ grpc_lfev_destroy(&fd->write_closure);
+
+ gpr_mu_unlock(&fd_freelist_mu);
+}
+
+static void fd_global_init(void) { gpr_mu_init(&fd_freelist_mu); }
+
+static void fd_global_shutdown(void) {
+ gpr_mu_lock(&fd_freelist_mu);
+ gpr_mu_unlock(&fd_freelist_mu);
+ while (fd_freelist != NULL) {
+ grpc_fd *fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ gpr_mu_destroy(&fd->mu);
+ gpr_free(fd);
+ }
+ gpr_mu_destroy(&fd_freelist_mu);
+}
+
+static grpc_fd *fd_create(int fd, const char *name) {
+ grpc_fd *new_fd = get_fd_from_freelist();
+ if (new_fd == NULL) {
+ new_fd = gpr_malloc(sizeof(grpc_fd));
+ gpr_mu_init(&new_fd->mu);
+ }
+
+ /* Note: It is not really needed to get the new_fd->mu lock here. If this
+ * is a newly created fd (or an fd we got from the freelist), no one else
+ * would be holding a lock to it anyway. */
+ gpr_mu_lock(&new_fd->mu);
+ new_fd->eps = NULL;
+
+ new_fd->fd = fd;
+ new_fd->orphaned = false;
+ grpc_lfev_init(&new_fd->read_closure);
+ grpc_lfev_init(&new_fd->write_closure);
+
+ new_fd->freelist_next = NULL;
+ new_fd->on_done_closure = NULL;
+
+ gpr_mu_unlock(&new_fd->mu);
+
+ char *fd_name;
+ gpr_asprintf(&fd_name, "%s fd=%d", name, fd);
+ grpc_iomgr_register_object(&new_fd->iomgr_object, fd_name);
+ gpr_log(GPR_DEBUG, "FD %d %p create %s", fd, (void *)new_fd, fd_name);
+ gpr_free(fd_name);
+
+ /* Associate the fd with one of the eps */
+ add_fd_to_eps(new_fd);
+ return new_fd;
+}
+
+static int fd_wrapped_fd(grpc_fd *fd) {
+ int ret_fd = -1;
+ gpr_mu_lock(&fd->mu);
+ if (!fd->orphaned) {
+ ret_fd = fd->fd;
+ }
+ gpr_mu_unlock(&fd->mu);
+
+ return ret_fd;
+}
+
+static void fd_orphan(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *on_done, int *release_fd,
+ const char *reason) {
+ bool is_fd_closed = false;
+ grpc_error *error = GRPC_ERROR_NONE;
+ epoll_set *unref_eps = NULL;
+
+ gpr_mu_lock(&fd->mu);
+ fd->on_done_closure = on_done;
+
+ /* If release_fd is not NULL, we should be relinquishing control of the file
+ descriptor fd->fd (but we still own the grpc_fd structure). */
+ if (release_fd != NULL) {
+ *release_fd = fd->fd;
+ } else {
+ close(fd->fd);
+ is_fd_closed = true;
+ }
+
+ fd->orphaned = true;
+
+ /* Remove the fd from the epoll set */
+ if (fd->eps != NULL) {
+ epoll_set_remove_fd(fd->eps, fd, is_fd_closed, &error);
+ unref_eps = fd->eps;
+ fd->eps = NULL;
+ }
+
+ grpc_closure_sched(exec_ctx, fd->on_done_closure, GRPC_ERROR_REF(error));
+
+ gpr_mu_unlock(&fd->mu);
+
+ /* We are done with this fd. Release it (i.e add back to freelist) */
+ add_fd_to_freelist(fd);
+
+ if (unref_eps != NULL) {
+ /* Unref stale epoll set here, outside the fd lock above.
+ The epoll set owns a workqueue which owns an fd, and unreffing
+ inside the lock can cause an eventual lock loop that makes TSAN very
+ unhappy. */
+ EPS_UNREF(exec_ctx, unref_eps, "fd_orphan");
+ }
+ GRPC_LOG_IF_ERROR("fd_orphan", GRPC_ERROR_REF(error));
+ GRPC_ERROR_UNREF(error);
+}
+
+/* This polling engine doesn't really need the read notifier functionality. So
+ * it just returns a dummy read notifier pollset */
+static grpc_pollset *fd_get_read_notifier_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_fd *fd) {
+ return &g_read_notifier;
+}
+
+static bool fd_is_shutdown(grpc_fd *fd) {
+ return grpc_lfev_is_shutdown(&fd->read_closure);
+}
+
+/* Might be called multiple times */
+static void fd_shutdown(grpc_exec_ctx *exec_ctx, grpc_fd *fd, grpc_error *why) {
+ if (grpc_lfev_set_shutdown(exec_ctx, &fd->read_closure,
+ GRPC_ERROR_REF(why))) {
+ shutdown(fd->fd, SHUT_RDWR);
+ grpc_lfev_set_shutdown(exec_ctx, &fd->write_closure, GRPC_ERROR_REF(why));
+ }
+ GRPC_ERROR_UNREF(why);
+}
+
+static void fd_notify_on_read(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->read_closure, closure);
+}
+
+static void fd_notify_on_write(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->write_closure, closure);
+}
+
+static grpc_workqueue *fd_get_workqueue(grpc_fd *fd) { return NULL; }
+
+/*******************************************************************************
+ * Pollset Definitions
+ */
+/* TODO: sreek - Not needed anymore */
+GPR_TLS_DECL(g_current_thread_pollset);
+GPR_TLS_DECL(g_current_thread_worker);
+
+static void pollset_worker_init(grpc_pollset_worker *worker) {
+ worker->next = worker->prev = NULL;
+ gpr_cv_init(&worker->kick_cv);
+}
+
+/* Global state management */
+static grpc_error *pollset_global_init(void) {
+ gpr_tls_init(&g_current_thread_pollset);
+ gpr_tls_init(&g_current_thread_worker);
+ return GRPC_ERROR_NONE;
+}
+
+static void pollset_global_shutdown(void) {
+ gpr_tls_destroy(&g_current_thread_pollset);
+ gpr_tls_destroy(&g_current_thread_worker);
+}
+
+static grpc_error *pollset_worker_kick(grpc_pollset_worker *worker) {
+ gpr_cv_signal(&worker->kick_cv);
+ return GRPC_ERROR_NONE;
+}
+
+/* Return 1 if the pollset has active threads in pollset_work (pollset must
+ * be locked) */
+static int pollset_has_workers(grpc_pollset *p) {
+ return p->root_worker.next != &p->root_worker;
+}
+
+static void remove_worker(grpc_pollset *p, grpc_pollset_worker *worker) {
+ worker->prev->next = worker->next;
+ worker->next->prev = worker->prev;
+}
+
+static grpc_pollset_worker *pop_front_worker(grpc_pollset *p) {
+ if (pollset_has_workers(p)) {
+ grpc_pollset_worker *w = p->root_worker.next;
+ remove_worker(p, w);
+ return w;
+ } else {
+ return NULL;
+ }
+}
+
+static void push_back_worker(grpc_pollset *p, grpc_pollset_worker *worker) {
+ worker->next = &p->root_worker;
+ worker->prev = worker->next->prev;
+ worker->prev->next = worker->next->prev = worker;
+}
+
+static void push_front_worker(grpc_pollset *p, grpc_pollset_worker *worker) {
+ worker->prev = &p->root_worker;
+ worker->next = worker->prev->next;
+ worker->prev->next = worker->next->prev = worker;
+}
+
+/* p->mu must be held before calling this function */
+static grpc_error *pollset_kick(grpc_pollset *p,
+ grpc_pollset_worker *specific_worker) {
+ GPR_TIMER_BEGIN("pollset_kick", 0);
+ grpc_error *error = GRPC_ERROR_NONE;
+ const char *err_desc = "Kick Failure";
+ grpc_pollset_worker *worker = specific_worker;
+ if (worker != NULL) {
+ if (worker == GRPC_POLLSET_KICK_BROADCAST) {
+ if (pollset_has_workers(p)) {
+ GPR_TIMER_BEGIN("pollset_kick.broadcast", 0);
+ for (worker = p->root_worker.next; worker != &p->root_worker;
+ worker = worker->next) {
+ if (gpr_tls_get(&g_current_thread_worker) != (intptr_t)worker) {
+ append_error(&error, pollset_worker_kick(worker), err_desc);
+ }
+ }
+ GPR_TIMER_END("pollset_kick.broadcast", 0);
+ } else {
+ p->kicked_without_pollers = true;
+ }
+ } else {
+ GPR_TIMER_MARK("kicked_specifically", 0);
+ if (gpr_tls_get(&g_current_thread_worker) != (intptr_t)worker) {
+ append_error(&error, pollset_worker_kick(worker), err_desc);
+ }
+ }
+ } else if (gpr_tls_get(&g_current_thread_pollset) != (intptr_t)p) {
+ /* Since worker == NULL, it means that we can kick "any" worker on this
+ pollset 'p'. If 'p' happens to be the same pollset this thread is
+ currently polling (i.e in pollset_work() function), then there is no need
+ to kick any other worker since the current thread can just absorb the
+ kick. This is the reason why we enter this case only when
+ g_current_thread_pollset is != p */
+
+ GPR_TIMER_MARK("kick_anonymous", 0);
+ worker = pop_front_worker(p);
+ if (worker != NULL) {
+ GPR_TIMER_MARK("finally_kick", 0);
+ push_back_worker(p, worker);
+ append_error(&error, pollset_worker_kick(worker), err_desc);
+ } else {
+ GPR_TIMER_MARK("kicked_no_pollers", 0);
+ p->kicked_without_pollers = true;
+ }
+ }
+
+ GPR_TIMER_END("pollset_kick", 0);
+ GRPC_LOG_IF_ERROR("pollset_kick", GRPC_ERROR_REF(error));
+ return error;
+}
+
+static void pollset_init(grpc_pollset *pollset, gpr_mu **mu) {
+ gpr_mu_init(&pollset->mu);
+ *mu = &pollset->mu;
+ pollset->eps = NULL;
+
+ pollset->root_worker.next = pollset->root_worker.prev = &pollset->root_worker;
+ pollset->kicked_without_pollers = false;
+
+ pollset->shutting_down = false;
+ pollset->finish_shutdown_called = false;
+ pollset->shutdown_done = NULL;
+}
+
+static void fd_become_readable(grpc_exec_ctx *exec_ctx, grpc_fd *fd) {
+ grpc_lfev_set_ready(exec_ctx, &fd->read_closure);
+}
+
+static void fd_become_writable(grpc_exec_ctx *exec_ctx, grpc_fd *fd) {
+ grpc_lfev_set_ready(exec_ctx, &fd->write_closure);
+}
+
+static void pollset_release_epoll_set(grpc_exec_ctx *exec_ctx, grpc_pollset *ps,
+ char *reason) {
+ if (ps->eps != NULL) {
+ EPS_UNREF(exec_ctx, ps->eps, reason);
+ }
+ ps->eps = NULL;
+}
+
+static void finish_shutdown_locked(grpc_exec_ctx *exec_ctx,
+ grpc_pollset *pollset) {
+ /* The pollset cannot have any workers if we are at this stage */
+ GPR_ASSERT(!pollset_has_workers(pollset));
+
+ pollset->finish_shutdown_called = true;
+
+ /* Release the ref and set pollset->eps to NULL */
+ pollset_release_epoll_set(exec_ctx, pollset, "ps_shutdown");
+ grpc_closure_sched(exec_ctx, pollset->shutdown_done, GRPC_ERROR_NONE);
+}
+
+/* pollset->mu lock must be held by the caller before calling this */
+static void pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_closure *closure) {
+ GPR_TIMER_BEGIN("pollset_shutdown", 0);
+ GPR_ASSERT(!pollset->shutting_down);
+ pollset->shutting_down = true;
+ pollset->shutdown_done = closure;
+ pollset_kick(pollset, GRPC_POLLSET_KICK_BROADCAST);
+
+ /* If the pollset has any workers, we cannot call finish_shutdown_locked()
+ because it would release the underlying epoll set. In such a case, we
+ let the last worker call finish_shutdown_locked() from pollset_work() */
+ if (!pollset_has_workers(pollset)) {
+ GPR_ASSERT(!pollset->finish_shutdown_called);
+ GPR_TIMER_MARK("pollset_shutdown.finish_shutdown_locked", 0);
+ finish_shutdown_locked(exec_ctx, pollset);
+ }
+ GPR_TIMER_END("pollset_shutdown", 0);
+}
+
+/* pollset_shutdown is guaranteed to be called before pollset_destroy. So other
+ * than destroying the mutexes, there is nothing special that needs to be done
+ * here */
+static void pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
+ GPR_ASSERT(!pollset_has_workers(pollset));
+ gpr_mu_destroy(&pollset->mu);
+}
+
+static bool maybe_do_workqueue_work(grpc_exec_ctx *exec_ctx, epoll_set *eps) {
+ if (gpr_mu_trylock(&eps->workqueue_read_mu)) {
+ gpr_mpscq_node *n = gpr_mpscq_pop(&eps->workqueue_items);
+ gpr_mu_unlock(&eps->workqueue_read_mu);
+ if (n != NULL) {
+ if (gpr_atm_full_fetch_add(&eps->workqueue_item_count, -1) > 1) {
+ workqueue_maybe_wakeup(eps);
+ }
+ grpc_closure *c = (grpc_closure *)n;
+ grpc_error *error = c->error_data.error;
+#ifndef NDEBUG
+ c->scheduled = false;
+#endif
+ c->cb(exec_ctx, c->cb_arg, error);
+ GRPC_ERROR_UNREF(error);
+ return true;
+ } else if (gpr_atm_no_barrier_load(&eps->workqueue_item_count) > 0) {
+ /* n == NULL might mean there's work but it's not available to be popped
+ * yet - try to ensure another workqueue wakes up to check shortly if so
+ */
+ workqueue_maybe_wakeup(eps);
+ }
+ }
+ return false;
+}
+
+/* Blocking call */
+static void acquire_epoll_lease(epoll_set *eps) {
+ if (g_num_threads_per_eps > 1) {
+ gpr_mu_lock(&eps->mu);
+ }
+}
+
+static void release_epoll_lease(epoll_set *eps) {
+ if (g_num_threads_per_eps > 1) {
+ gpr_mu_unlock(&eps->mu);
+ }
+}
+
+#define GRPC_EPOLL_MAX_EVENTS 100
+static void do_epoll_wait(grpc_exec_ctx *exec_ctx, int epoll_fd, epoll_set *eps,
+ grpc_error **error) {
+ struct epoll_event ep_ev[GRPC_EPOLL_MAX_EVENTS];
+ int ep_rv;
+ char *err_msg;
+ const char *err_desc = "do_epoll_wait";
+
+ int timeout_ms = -1;
+
+ GRPC_SCHEDULING_START_BLOCKING_REGION;
+ acquire_epoll_lease(eps);
+ ep_rv = epoll_wait(epoll_fd, ep_ev, GRPC_EPOLL_MAX_EVENTS, timeout_ms);
+ release_epoll_lease(eps);
+ GRPC_SCHEDULING_END_BLOCKING_REGION;
+
+ if (ep_rv < 0) {
+ gpr_asprintf(&err_msg,
+ "epoll_wait() epoll fd: %d failed with error: %d (%s)",
+ epoll_fd, errno, strerror(errno));
+ append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
+ }
+
+#ifdef GRPC_TSAN
+ /* See the definition of g_poll_sync for more details */
+ gpr_atm_acq_load(&g_epoll_sync);
+#endif /* defined(GRPC_TSAN) */
+
+ for (int i = 0; i < ep_rv; ++i) {
+ void *data_ptr = ep_ev[i].data.ptr;
+ if (data_ptr == &eps->workqueue_wakeup_fd) {
+ append_error(error,
+ grpc_wakeup_fd_consume_wakeup(&eps->workqueue_wakeup_fd),
+ err_desc);
+ maybe_do_workqueue_work(exec_ctx, eps);
+ } else if (data_ptr == &epoll_set_wakeup_fd) {
+ gpr_atm_rel_store(&eps->is_shutdown, 1);
+ gpr_log(GPR_INFO, "pollset poller: shutdown set");
+ } else {
+ grpc_fd *fd = data_ptr;
+ int cancel = ep_ev[i].events & (EPOLLERR | EPOLLHUP);
+ int read_ev = ep_ev[i].events & (EPOLLIN | EPOLLPRI);
+ int write_ev = ep_ev[i].events & EPOLLOUT;
+ if (read_ev || cancel) {
+ fd_become_readable(exec_ctx, fd);
+ }
+ if (write_ev || cancel) {
+ fd_become_writable(exec_ctx, fd);
+ }
+ }
+ }
+}
+
+static void epoll_set_work(grpc_exec_ctx *exec_ctx, epoll_set *eps,
+ grpc_error **error) {
+ int epoll_fd = -1;
+ GPR_TIMER_BEGIN("epoll_set_work", 0);
+
+ /* Since epoll_fd is immutable, it is safe to read it without a lock on the
+ epoll set. */
+ epoll_fd = eps->epoll_fd;
+
+ /* If we get some workqueue work to do, it might end up completing an item on
+ the completion queue, so there's no need to poll... so we skip that and
+ redo the complete loop to verify */
+ if (!maybe_do_workqueue_work(exec_ctx, eps)) {
+ gpr_atm_no_barrier_fetch_add(&eps->poller_count, 1);
+ g_current_thread_epoll_set = eps;
+
+ do_epoll_wait(exec_ctx, epoll_fd, eps, error);
+
+ g_current_thread_epoll_set = NULL;
+ gpr_atm_no_barrier_fetch_add(&eps->poller_count, -1);
+ }
+
+ GPR_TIMER_END("epoll_set_work", 0);
+}
+
+/* pollset->mu lock must be held by the caller before calling this.
+ The function pollset_work() may temporarily release the lock (pollset->mu)
+ during the course of its execution but it will always re-acquire the lock and
+ ensure that it is held by the time the function returns */
+static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_pollset_worker **worker_hdl,
+ gpr_timespec now, gpr_timespec deadline) {
+ GPR_TIMER_BEGIN("pollset_work", 0);
+ grpc_error *error = GRPC_ERROR_NONE;
+
+ grpc_pollset_worker worker;
+ pollset_worker_init(&worker);
+
+ if (worker_hdl) *worker_hdl = &worker;
+
+ gpr_tls_set(&g_current_thread_pollset, (intptr_t)pollset);
+ gpr_tls_set(&g_current_thread_worker, (intptr_t)&worker);
+
+ if (pollset->kicked_without_pollers) {
+ /* If the pollset was kicked without pollers, pretend that the current
+ worker got the kick and skip polling. A kick indicates that there is some
+ work that needs attention like an event on the completion queue or an
+ alarm */
+ GPR_TIMER_MARK("pollset_work.kicked_without_pollers", 0);
+ pollset->kicked_without_pollers = 0;
+ } else if (!pollset->shutting_down) {
+ push_front_worker(pollset, &worker);
+
+ gpr_cv_wait(&worker.kick_cv, &pollset->mu,
+ gpr_convert_clock_type(deadline, GPR_CLOCK_REALTIME));
+ /* pollset->mu locked here */
+
+ remove_worker(pollset, &worker);
+ }
+
+ /* If we are the last worker on the pollset (i.e pollset_has_workers() is
+ false at this point) and the pollset is shutting down, we may have to
+ finish the shutdown process by calling finish_shutdown_locked().
+ See pollset_shutdown() for more details.
+
+ Note: Continuing to access pollset here is safe; it is the caller's
+ responsibility to not destroy a pollset when it has outstanding calls to
+ pollset_work() */
+ if (pollset->shutting_down && !pollset_has_workers(pollset) &&
+ !pollset->finish_shutdown_called) {
+ GPR_TIMER_MARK("pollset_work.finish_shutdown_locked", 0);
+ finish_shutdown_locked(exec_ctx, pollset);
+
+ gpr_mu_unlock(&pollset->mu);
+ grpc_exec_ctx_flush(exec_ctx);
+ gpr_mu_lock(&pollset->mu);
+ }
+
+ if (worker_hdl) *worker_hdl = NULL;
+
+ gpr_tls_set(&g_current_thread_pollset, (intptr_t)0);
+ gpr_tls_set(&g_current_thread_worker, (intptr_t)0);
+
+ GPR_TIMER_END("pollset_work", 0);
+
+ GRPC_LOG_IF_ERROR("pollset_work", GRPC_ERROR_REF(error));
+ return error;
+}
+
+static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_fd *fd) {
+ /* Nothing to do */
+}
+
+/*******************************************************************************
+ * Pollset-set Definitions
+ */
+grpc_pollset_set g_dummy_pollset_set;
+static grpc_pollset_set *pollset_set_create(void) {
+ return &g_dummy_pollset_set;
+}
+
+static void pollset_set_destroy(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss) {
+ /* Nothing to do */
+}
+
+static void pollset_set_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {
+ /* Nothing to do */
+}
+
+static void pollset_set_del_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {
+ /* Nothing to do */
+}
+
+static void pollset_set_add_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {
+ /* Nothing to do */
+}
+
+static void pollset_set_del_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {
+ /* Nothing to do */
+}
+
+static void pollset_set_add_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {
+ /* Nothing to do */
+}
+
+static void pollset_set_del_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {
+ /* Nothing to do */
+}
+
+/*******************************************************************************
+ * Event engine binding
+ */
+
+static void shutdown_engine(void) {
+ shutdown_poller_threads();
+ shutdown_epoll_sets();
+ fd_global_shutdown();
+ pollset_global_shutdown();
+ epoll_set_global_shutdown();
+ gpr_log(GPR_INFO, "ev-epoll-threadpool engine shutdown complete");
+}
+
+static const grpc_event_engine_vtable vtable = {
+ .pollset_size = sizeof(grpc_pollset),
+
+ .fd_create = fd_create,
+ .fd_wrapped_fd = fd_wrapped_fd,
+ .fd_orphan = fd_orphan,
+ .fd_shutdown = fd_shutdown,
+ .fd_is_shutdown = fd_is_shutdown,
+ .fd_notify_on_read = fd_notify_on_read,
+ .fd_notify_on_write = fd_notify_on_write,
+ .fd_get_read_notifier_pollset = fd_get_read_notifier_pollset,
+ .fd_get_workqueue = fd_get_workqueue,
+
+ .pollset_init = pollset_init,
+ .pollset_shutdown = pollset_shutdown,
+ .pollset_destroy = pollset_destroy,
+ .pollset_work = pollset_work,
+ .pollset_kick = pollset_kick,
+ .pollset_add_fd = pollset_add_fd,
+
+ .pollset_set_create = pollset_set_create,
+ .pollset_set_destroy = pollset_set_destroy,
+ .pollset_set_add_pollset = pollset_set_add_pollset,
+ .pollset_set_del_pollset = pollset_set_del_pollset,
+ .pollset_set_add_pollset_set = pollset_set_add_pollset_set,
+ .pollset_set_del_pollset_set = pollset_set_del_pollset_set,
+ .pollset_set_add_fd = pollset_set_add_fd,
+ .pollset_set_del_fd = pollset_set_del_fd,
+
+ .workqueue_ref = workqueue_ref,
+ .workqueue_unref = workqueue_unref,
+ .workqueue_scheduler = workqueue_scheduler,
+
+ .shutdown_engine = shutdown_engine,
+};
+
+/*****************************************************************************
+ * Dedicated polling threads and pollsets - Definitions
+ */
+static void add_fd_to_eps(grpc_fd *fd) {
+ GPR_ASSERT(fd->eps == NULL);
+ GPR_TIMER_BEGIN("add_fd_to_eps", 0);
+
+ grpc_error *error = GRPC_ERROR_NONE;
+ size_t idx = (size_t)gpr_atm_no_barrier_fetch_add(&g_next_eps, 1) % g_num_eps;
+ epoll_set *eps = g_epoll_sets[idx];
+
+ gpr_mu_lock(&fd->mu);
+
+ if (fd->orphaned) {
+ gpr_mu_unlock(&fd->mu);
+ return; /* Early out */
+ }
+
+ epoll_set_add_fd_locked(eps, fd, &error);
+ EPS_ADD_REF(eps, "fd");
+ fd->eps = eps;
+
+ GRPC_POLLING_TRACE("add_fd_to_eps (fd: %d, eps idx = %" PRIdPTR ")", fd->fd,
+ idx);
+ gpr_mu_unlock(&fd->mu);
+
+ GRPC_LOG_IF_ERROR("add_fd_to_eps", error);
+ GPR_TIMER_END("add_fd_to_eps", 0);
+}
+
+static bool init_epoll_sets() {
+ grpc_error *error = GRPC_ERROR_NONE;
+ bool is_success = true;
+
+ g_epoll_sets = (epoll_set **)malloc(g_num_eps * sizeof(epoll_set *));
+
+ for (size_t i = 0; i < g_num_eps; i++) {
+ g_epoll_sets[i] = epoll_set_create(&error);
+ if (g_epoll_sets[i] == NULL) {
+ gpr_log(GPR_ERROR, "Error in creating a epoll set");
+ g_num_eps = i; /* Helps cleanup */
+ shutdown_epoll_sets();
+ is_success = false;
+ goto done;
+ }
+
+ EPS_ADD_REF(g_epoll_sets[i], "init_epoll_sets");
+ }
+
+ gpr_atm_no_barrier_store(&g_next_eps, 0);
+ gpr_mu *mu;
+ pollset_init(&g_read_notifier, &mu);
+
+done:
+ GRPC_LOG_IF_ERROR("init_epoll_sets", error);
+ return is_success;
+}
+
+static void shutdown_epoll_sets() {
+ if (!g_epoll_sets) {
+ return;
+ }
+
+ grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
+ for (size_t i = 0; i < g_num_eps; i++) {
+ EPS_UNREF(&exec_ctx, g_epoll_sets[i], "shutdown_epoll_sets");
+ }
+ grpc_exec_ctx_flush(&exec_ctx);
+
+ gpr_free(g_epoll_sets);
+ g_epoll_sets = NULL;
+ pollset_destroy(&exec_ctx, &g_read_notifier);
+ grpc_exec_ctx_finish(&exec_ctx);
+}
+
+static void poller_thread_loop(void *arg) {
+ grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
+ grpc_error *error = GRPC_ERROR_NONE;
+ epoll_set *eps = (epoll_set *)arg;
+
+ while (!gpr_atm_acq_load(&eps->is_shutdown)) {
+ epoll_set_work(&exec_ctx, eps, &error);
+ grpc_exec_ctx_flush(&exec_ctx);
+ }
+
+ grpc_exec_ctx_finish(&exec_ctx);
+ GRPC_LOG_IF_ERROR("poller_thread_loop", error);
+}
+
+/* g_epoll_sets MUST be initialized before calling this */
+static void start_poller_threads() {
+ GPR_ASSERT(g_epoll_sets);
+
+ gpr_log(GPR_INFO, "Starting poller threads");
+
+ size_t num_threads = g_num_eps * g_num_threads_per_eps;
+ g_poller_threads = (gpr_thd_id *)malloc(num_threads * sizeof(gpr_thd_id));
+ gpr_thd_options options = gpr_thd_options_default();
+ gpr_thd_options_set_joinable(&options);
+
+ for (size_t i = 0; i < num_threads; i++) {
+ gpr_thd_new(&g_poller_threads[i], poller_thread_loop,
+ (void *)g_epoll_sets[i % g_num_eps], &options);
+ }
+}
+
+static void shutdown_poller_threads() {
+ GPR_ASSERT(g_poller_threads);
+ GPR_ASSERT(g_epoll_sets);
+ grpc_error *error = GRPC_ERROR_NONE;
+
+ gpr_log(GPR_INFO, "Shutting down pollers");
+
+ epoll_set *eps = NULL;
+ size_t num_threads = g_num_eps * g_num_threads_per_eps;
+ for (size_t i = 0; i < num_threads; i++) {
+ eps = g_epoll_sets[i];
+ epoll_set_add_wakeup_fd_locked(eps, &epoll_set_wakeup_fd, &error);
+ }
+
+ for (size_t i = 0; i < g_num_eps; i++) {
+ gpr_thd_join(g_poller_threads[i]);
+ }
+
+ GRPC_LOG_IF_ERROR("shutdown_poller_threads", error);
+ gpr_free(g_poller_threads);
+ g_poller_threads = NULL;
+}
+
+/****************************************************************************/
+
+/* It is possible that GLIBC has epoll but the underlying kernel doesn't.
+ * Create a dummy epoll_fd to make sure epoll support is available */
+static bool is_epoll_available() {
+ int fd = epoll_create1(EPOLL_CLOEXEC);
+ if (fd < 0) {
+ gpr_log(
+ GPR_ERROR,
+ "epoll_create1 failed with error: %d. Not using epoll polling engine",
+ fd);
+ return false;
+ }
+ close(fd);
+ return true;
+}
+
+const grpc_event_engine_vtable *grpc_init_epoll_thread_pool_linux(
+ bool requested_explicitly) {
+ if (!requested_explicitly) return NULL;
+
+ if (!grpc_has_wakeup_fd()) {
+ return NULL;
+ }
+
+ if (!is_epoll_available()) {
+ return NULL;
+ }
+
+ fd_global_init();
+
+ if (!GRPC_LOG_IF_ERROR("pollset_global_init", pollset_global_init())) {
+ return NULL;
+ }
+
+ if (!GRPC_LOG_IF_ERROR("epoll_set_global_init", epoll_set_global_init())) {
+ return NULL;
+ }
+
+ if (!init_epoll_sets()) {
+ return NULL;
+ }
+
+ /* TODO (sreek): Maynot be a good idea to start threads here (especially if
+ * this engine doesn't get picked. Consider introducing an engine_init
+ * function in the vtable */
+ start_poller_threads();
+ return &vtable;
+}
+
+#else /* defined(GRPC_LINUX_EPOLL) */
+#if defined(GRPC_POSIX_SOCKET)
+#include "src/core/lib/iomgr/ev_posix.h"
+/* If GRPC_LINUX_EPOLL is not defined, it means epoll is not available. Return
+ * NULL */
+const grpc_event_engine_vtable *grpc_init_epoll_thread_pool_linux(
+ bool requested_explicitly) {
+ return NULL;
+}
+#endif /* defined(GRPC_POSIX_SOCKET) */
+#endif /* !defined(GRPC_LINUX_EPOLL) */
diff --git a/src/core/lib/iomgr/ev_epoll_thread_pool_linux.h b/src/core/lib/iomgr/ev_epoll_thread_pool_linux.h
new file mode 100644
index 0000000000..9af776a52e
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epoll_thread_pool_linux.h
@@ -0,0 +1,43 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_IOMGR_EV_EPOLL_THREAD_POOL_LINUX_H
+#define GRPC_CORE_LIB_IOMGR_EV_EPOLL_THREAD_POOL_LINUX_H
+
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/port.h"
+
+const grpc_event_engine_vtable *grpc_init_epoll_thread_pool_linux(
+ bool requested_explicitly);
+
+#endif /* GRPC_CORE_LIB_IOMGR_EV_EPOLL_THREAD_POOL_LINUX_H */
diff --git a/src/core/lib/iomgr/ev_epollex_linux.c b/src/core/lib/iomgr/ev_epollex_linux.c
new file mode 100644
index 0000000000..7cb6085e25
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epollex_linux.c
@@ -0,0 +1,1511 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/lib/iomgr/port.h"
+
+/* This polling engine is only relevant on linux kernels supporting epoll() */
+#ifdef GRPC_LINUX_EPOLL
+
+#include "src/core/lib/iomgr/ev_epollex_linux.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <grpc/support/alloc.h>
+#include <grpc/support/log.h>
+#include <grpc/support/string_util.h>
+#include <grpc/support/tls.h>
+#include <grpc/support/useful.h>
+
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/iomgr_internal.h"
+#include "src/core/lib/iomgr/is_epollexclusive_available.h"
+#include "src/core/lib/iomgr/lockfree_event.h"
+#include "src/core/lib/iomgr/sys_epoll_wrapper.h"
+#include "src/core/lib/iomgr/timer.h"
+#include "src/core/lib/iomgr/wakeup_fd_posix.h"
+#include "src/core/lib/iomgr/workqueue.h"
+#include "src/core/lib/profiling/timers.h"
+#include "src/core/lib/support/block_annotate.h"
+#include "src/core/lib/support/spinlock.h"
+
+/*******************************************************************************
+ * Pollset-set sibling link
+ */
+
+typedef enum {
+ PO_POLLING_GROUP,
+ PO_POLLSET_SET,
+ PO_POLLSET,
+ PO_FD, /* ordering is important: we always want to lock pollsets before fds:
+ this guarantees that using an fd as a pollable is safe */
+ PO_EMPTY_POLLABLE,
+ PO_COUNT
+} polling_obj_type;
+
+typedef struct polling_obj polling_obj;
+typedef struct polling_group polling_group;
+
+struct polling_obj {
+ gpr_mu mu;
+ polling_obj_type type;
+ polling_group *group;
+ struct polling_obj *next;
+ struct polling_obj *prev;
+};
+
+struct polling_group {
+ polling_obj po;
+ gpr_refcount refs;
+};
+
+static void po_init(polling_obj *po, polling_obj_type type);
+static void po_destroy(polling_obj *po);
+static void po_join(grpc_exec_ctx *exec_ctx, polling_obj *a, polling_obj *b);
+static int po_cmp(polling_obj *a, polling_obj *b);
+
+static void pg_create(grpc_exec_ctx *exec_ctx, polling_obj **initial_po,
+ size_t initial_po_count);
+static polling_group *pg_ref(polling_group *pg);
+static void pg_unref(polling_group *pg);
+static void pg_merge(grpc_exec_ctx *exec_ctx, polling_group *a,
+ polling_group *b);
+static void pg_join(grpc_exec_ctx *exec_ctx, polling_group *pg,
+ polling_obj *po);
+
+/*******************************************************************************
+ * pollable Declarations
+ */
+
+typedef struct pollable {
+ polling_obj po;
+ int epfd;
+ grpc_wakeup_fd wakeup;
+ grpc_pollset_worker *root_worker;
+} pollable;
+
+static pollable g_empty_pollable;
+
+static void pollable_init(pollable *p, polling_obj_type type);
+static void pollable_destroy(pollable *p);
+/* ensure that p->epfd, p->wakeup are initialized; p->po.mu must be held */
+static grpc_error *pollable_materialize(pollable *p);
+
+/*******************************************************************************
+ * Fd Declarations
+ */
+
+struct grpc_fd {
+ pollable pollable;
+ int fd;
+ /* refst format:
+ bit 0 : 1=Active / 0=Orphaned
+ bits 1-n : refcount
+ Ref/Unref by two to avoid altering the orphaned bit */
+ gpr_atm refst;
+
+ /* Wakeup fd used to wake pollers to check the contents of workqueue_items */
+ grpc_wakeup_fd workqueue_wakeup_fd;
+ grpc_closure_scheduler workqueue_scheduler;
+ /* Spinlock guarding the read end of the workqueue (must be held to pop from
+ * workqueue_items) */
+ gpr_spinlock workqueue_read_mu;
+ /* Queue of closures to be executed */
+ gpr_mpscq workqueue_items;
+ /* Count of items in workqueue_items */
+ gpr_atm workqueue_item_count;
+
+ /* The fd is either closed or we relinquished control of it. In either
+ cases, this indicates that the 'fd' on this structure is no longer
+ valid */
+ gpr_mu orphaned_mu;
+ bool orphaned;
+
+ gpr_atm read_closure;
+ gpr_atm write_closure;
+
+ struct grpc_fd *freelist_next;
+ grpc_closure *on_done_closure;
+
+ /* The pollset that last noticed that the fd is readable. The actual type
+ * stored in this is (grpc_pollset *) */
+ gpr_atm read_notifier_pollset;
+
+ grpc_iomgr_object iomgr_object;
+};
+
+static void fd_global_init(void);
+static void fd_global_shutdown(void);
+
+static void workqueue_enqueue(grpc_exec_ctx *exec_ctx, grpc_closure *closure,
+ grpc_error *error);
+
+static const grpc_closure_scheduler_vtable workqueue_scheduler_vtable = {
+ workqueue_enqueue, workqueue_enqueue, "workqueue"};
+
+/*******************************************************************************
+ * Pollset Declarations
+ */
+
+typedef struct pollset_worker_link {
+ grpc_pollset_worker *next;
+ grpc_pollset_worker *prev;
+} pollset_worker_link;
+
+typedef enum {
+ PWL_POLLSET,
+ PWL_POLLABLE,
+ POLLSET_WORKER_LINK_COUNT
+} pollset_worker_links;
+
+struct grpc_pollset_worker {
+ bool kicked;
+ bool initialized_cv;
+ pollset_worker_link links[POLLSET_WORKER_LINK_COUNT];
+ gpr_cv cv;
+ grpc_pollset *pollset;
+ pollable *pollable;
+};
+
+struct grpc_pollset {
+ pollable pollable;
+ pollable *current_pollable;
+ bool kicked_without_poller;
+ grpc_closure *shutdown_closure;
+ grpc_pollset_worker *root_worker;
+};
+
+/*******************************************************************************
+ * Pollset-set Declarations
+ */
+struct grpc_pollset_set {
+ polling_obj po;
+};
+
+/*******************************************************************************
+ * Common helpers
+ */
+
+static bool append_error(grpc_error **composite, grpc_error *error,
+ const char *desc) {
+ if (error == GRPC_ERROR_NONE) return true;
+ if (*composite == GRPC_ERROR_NONE) {
+ *composite = GRPC_ERROR_CREATE_FROM_COPIED_STRING(desc);
+ }
+ *composite = grpc_error_add_child(*composite, error);
+ return false;
+}
+
+/*******************************************************************************
+ * Fd Definitions
+ */
+
+/* We need to keep a freelist not because of any concerns of malloc performance
+ * but instead so that implementations with multiple threads in (for example)
+ * epoll_wait deal with the race between pollset removal and incoming poll
+ * notifications.
+ *
+ * The problem is that the poller ultimately holds a reference to this
+ * object, so it is very difficult to know when is safe to free it, at least
+ * without some expensive synchronization.
+ *
+ * If we keep the object freelisted, in the worst case losing this race just
+ * becomes a spurious read notification on a reused fd.
+ */
+
+/* The alarm system needs to be able to wakeup 'some poller' sometimes
+ * (specifically when a new alarm needs to be triggered earlier than the next
+ * alarm 'epoch'). This wakeup_fd gives us something to alert on when such a
+ * case occurs. */
+
+static grpc_fd *fd_freelist = NULL;
+static gpr_mu fd_freelist_mu;
+
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+#define REF_BY(fd, n, reason) ref_by(fd, n, reason, __FILE__, __LINE__)
+#define UNREF_BY(ec, fd, n, reason) \
+ unref_by(ec, fd, n, reason, __FILE__, __LINE__)
+static void ref_by(grpc_fd *fd, int n, const char *reason, const char *file,
+ int line) {
+ gpr_log(GPR_DEBUG, "FD %d %p ref %d %ld -> %ld [%s; %s:%d]", fd->fd,
+ (void *)fd, n, gpr_atm_no_barrier_load(&fd->refst),
+ gpr_atm_no_barrier_load(&fd->refst) + n, reason, file, line);
+#else
+#define REF_BY(fd, n, reason) ref_by(fd, n)
+#define UNREF_BY(ec, fd, n, reason) unref_by(ec, fd, n)
+static void ref_by(grpc_fd *fd, int n) {
+#endif
+ GPR_ASSERT(gpr_atm_no_barrier_fetch_add(&fd->refst, n) > 0);
+}
+
+static void fd_destroy(grpc_exec_ctx *exec_ctx, void *arg, grpc_error *error) {
+ grpc_fd *fd = arg;
+ /* Add the fd to the freelist */
+ grpc_iomgr_unregister_object(&fd->iomgr_object);
+ pollable_destroy(&fd->pollable);
+ gpr_mu_destroy(&fd->orphaned_mu);
+ gpr_mu_lock(&fd_freelist_mu);
+ fd->freelist_next = fd_freelist;
+ fd_freelist = fd;
+
+ grpc_lfev_destroy(&fd->read_closure);
+ grpc_lfev_destroy(&fd->write_closure);
+
+ gpr_mu_unlock(&fd_freelist_mu);
+}
+
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+static void unref_by(grpc_exec_ctx *exec_ctx, grpc_fd *fd, int n,
+ const char *reason, const char *file, int line) {
+ gpr_atm old;
+ gpr_log(GPR_DEBUG, "FD %d %p unref %d %ld -> %ld [%s; %s:%d]", fd->fd,
+ (void *)fd, n, gpr_atm_no_barrier_load(&fd->refst),
+ gpr_atm_no_barrier_load(&fd->refst) - n, reason, file, line);
+#else
+static void unref_by(grpc_exec_ctx *exec_ctx, grpc_fd *fd, int n) {
+ gpr_atm old;
+#endif
+ old = gpr_atm_full_fetch_add(&fd->refst, -n);
+ if (old == n) {
+ grpc_closure_sched(exec_ctx, grpc_closure_create(fd_destroy, fd,
+ grpc_schedule_on_exec_ctx),
+ GRPC_ERROR_NONE);
+ } else {
+ GPR_ASSERT(old > n);
+ }
+}
+
+static void fd_global_init(void) { gpr_mu_init(&fd_freelist_mu); }
+
+static void fd_global_shutdown(void) {
+ gpr_mu_lock(&fd_freelist_mu);
+ gpr_mu_unlock(&fd_freelist_mu);
+ while (fd_freelist != NULL) {
+ grpc_fd *fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ gpr_free(fd);
+ }
+ gpr_mu_destroy(&fd_freelist_mu);
+}
+
+static grpc_fd *fd_create(int fd, const char *name) {
+ grpc_fd *new_fd = NULL;
+
+ gpr_mu_lock(&fd_freelist_mu);
+ if (fd_freelist != NULL) {
+ new_fd = fd_freelist;
+ fd_freelist = fd_freelist->freelist_next;
+ }
+ gpr_mu_unlock(&fd_freelist_mu);
+
+ if (new_fd == NULL) {
+ new_fd = gpr_malloc(sizeof(grpc_fd));
+ }
+
+ pollable_init(&new_fd->pollable, PO_FD);
+
+ gpr_atm_rel_store(&new_fd->refst, (gpr_atm)1);
+ new_fd->fd = fd;
+ gpr_mu_init(&new_fd->orphaned_mu);
+ new_fd->orphaned = false;
+ grpc_lfev_init(&new_fd->read_closure);
+ grpc_lfev_init(&new_fd->write_closure);
+ gpr_atm_no_barrier_store(&new_fd->read_notifier_pollset, (gpr_atm)NULL);
+
+ GRPC_LOG_IF_ERROR("fd_create",
+ grpc_wakeup_fd_init(&new_fd->workqueue_wakeup_fd));
+ new_fd->workqueue_scheduler.vtable = &workqueue_scheduler_vtable;
+ new_fd->workqueue_read_mu = GPR_SPINLOCK_INITIALIZER;
+ gpr_mpscq_init(&new_fd->workqueue_items);
+ gpr_atm_no_barrier_store(&new_fd->workqueue_item_count, 0);
+
+ new_fd->freelist_next = NULL;
+ new_fd->on_done_closure = NULL;
+
+ char *fd_name;
+ gpr_asprintf(&fd_name, "%s fd=%d", name, fd);
+ grpc_iomgr_register_object(&new_fd->iomgr_object, fd_name);
+#ifdef GRPC_FD_REF_COUNT_DEBUG
+ gpr_log(GPR_DEBUG, "FD %d %p create %s", fd, (void *)new_fd, fd_name);
+#endif
+ gpr_free(fd_name);
+ return new_fd;
+}
+
+static int fd_wrapped_fd(grpc_fd *fd) {
+ int ret_fd = -1;
+ gpr_mu_lock(&fd->orphaned_mu);
+ if (!fd->orphaned) {
+ ret_fd = fd->fd;
+ }
+ gpr_mu_unlock(&fd->orphaned_mu);
+
+ return ret_fd;
+}
+
+static void fd_orphan(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *on_done, int *release_fd,
+ const char *reason) {
+ bool is_fd_closed = false;
+ grpc_error *error = GRPC_ERROR_NONE;
+
+ gpr_mu_lock(&fd->pollable.po.mu);
+ gpr_mu_lock(&fd->orphaned_mu);
+ fd->on_done_closure = on_done;
+
+ /* If release_fd is not NULL, we should be relinquishing control of the file
+ descriptor fd->fd (but we still own the grpc_fd structure). */
+ if (release_fd != NULL) {
+ *release_fd = fd->fd;
+ } else {
+ close(fd->fd);
+ is_fd_closed = true;
+ }
+
+ fd->orphaned = true;
+
+ if (!is_fd_closed) {
+ gpr_log(GPR_DEBUG, "TODO: handle fd removal?");
+ }
+
+ /* Remove the active status but keep referenced. We want this grpc_fd struct
+ to be alive (and not added to freelist) until the end of this function */
+ REF_BY(fd, 1, reason);
+
+ grpc_closure_sched(exec_ctx, fd->on_done_closure, GRPC_ERROR_REF(error));
+
+ gpr_mu_unlock(&fd->orphaned_mu);
+ gpr_mu_unlock(&fd->pollable.po.mu);
+ UNREF_BY(exec_ctx, fd, 2, reason); /* Drop the reference */
+ GRPC_LOG_IF_ERROR("fd_orphan", GRPC_ERROR_REF(error));
+ GRPC_ERROR_UNREF(error);
+}
+
+static grpc_pollset *fd_get_read_notifier_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_fd *fd) {
+ gpr_atm notifier = gpr_atm_acq_load(&fd->read_notifier_pollset);
+ return (grpc_pollset *)notifier;
+}
+
+static bool fd_is_shutdown(grpc_fd *fd) {
+ return grpc_lfev_is_shutdown(&fd->read_closure);
+}
+
+/* Might be called multiple times */
+static void fd_shutdown(grpc_exec_ctx *exec_ctx, grpc_fd *fd, grpc_error *why) {
+ if (grpc_lfev_set_shutdown(exec_ctx, &fd->read_closure,
+ GRPC_ERROR_REF(why))) {
+ shutdown(fd->fd, SHUT_RDWR);
+ grpc_lfev_set_shutdown(exec_ctx, &fd->write_closure, GRPC_ERROR_REF(why));
+ }
+ GRPC_ERROR_UNREF(why);
+}
+
+static void fd_notify_on_read(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->read_closure, closure);
+}
+
+static void fd_notify_on_write(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_closure *closure) {
+ grpc_lfev_notify_on(exec_ctx, &fd->write_closure, closure);
+}
+
+static grpc_workqueue *fd_get_workqueue(grpc_fd *fd) {
+ REF_BY(fd, 2, "return_workqueue");
+ return (grpc_workqueue *)fd;
+}
+
+#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue,
+ const char *file, int line,
+ const char *reason) {
+ if (workqueue != NULL) {
+ ref_by((grpc_fd *)workqueue, 2, file, line, reason);
+ }
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx, grpc_workqueue *workqueue,
+ const char *file, int line, const char *reason) {
+ if (workqueue != NULL) {
+ unref_by(exec_ctx, (grpc_fd *)workqueue, 2, file, line, reason);
+ }
+}
+#else
+static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue) {
+ if (workqueue != NULL) {
+ ref_by((grpc_fd *)workqueue, 2);
+ }
+ return workqueue;
+}
+
+static void workqueue_unref(grpc_exec_ctx *exec_ctx,
+ grpc_workqueue *workqueue) {
+ if (workqueue != NULL) {
+ unref_by(exec_ctx, (grpc_fd *)workqueue, 2);
+ }
+}
+#endif
+
+static void workqueue_wakeup(grpc_fd *fd) {
+ GRPC_LOG_IF_ERROR("workqueue_enqueue",
+ grpc_wakeup_fd_wakeup(&fd->workqueue_wakeup_fd));
+}
+
+static void workqueue_enqueue(grpc_exec_ctx *exec_ctx, grpc_closure *closure,
+ grpc_error *error) {
+ GPR_TIMER_BEGIN("workqueue.enqueue", 0);
+ grpc_fd *fd = (grpc_fd *)(((char *)closure->scheduler) -
+ offsetof(grpc_fd, workqueue_scheduler));
+ REF_BY(fd, 2, "workqueue_enqueue");
+ gpr_atm last = gpr_atm_no_barrier_fetch_add(&fd->workqueue_item_count, 1);
+ closure->error_data.error = error;
+ gpr_mpscq_push(&fd->workqueue_items, &closure->next_data.atm_next);
+ if (last == 0) {
+ workqueue_wakeup(fd);
+ }
+ UNREF_BY(exec_ctx, fd, 2, "workqueue_enqueue");
+}
+
+static void fd_invoke_workqueue(grpc_exec_ctx *exec_ctx, grpc_fd *fd) {
+ /* handle spurious wakeups */
+ if (!gpr_spinlock_trylock(&fd->workqueue_read_mu)) return;
+ gpr_mpscq_node *n = gpr_mpscq_pop(&fd->workqueue_items);
+ gpr_spinlock_unlock(&fd->workqueue_read_mu);
+ if (n != NULL) {
+ if (gpr_atm_full_fetch_add(&fd->workqueue_item_count, -1) > 1) {
+ workqueue_wakeup(fd);
+ }
+ grpc_closure *c = (grpc_closure *)n;
+ grpc_error *error = c->error_data.error;
+#ifndef NDEBUG
+ c->scheduled = false;
+#endif
+ c->cb(exec_ctx, c->cb_arg, error);
+ GRPC_ERROR_UNREF(error);
+ } else if (gpr_atm_no_barrier_load(&fd->workqueue_item_count) > 0) {
+ /* n == NULL might mean there's work but it's not available to be popped
+ * yet - try to ensure another workqueue wakes up to check shortly if so
+ */
+ workqueue_wakeup(fd);
+ }
+}
+
+static grpc_closure_scheduler *workqueue_scheduler(grpc_workqueue *workqueue) {
+ return &((grpc_fd *)workqueue)->workqueue_scheduler;
+}
+
+/*******************************************************************************
+ * Pollable Definitions
+ */
+
+static void pollable_init(pollable *p, polling_obj_type type) {
+ po_init(&p->po, type);
+ p->root_worker = NULL;
+ p->epfd = -1;
+}
+
+static void pollable_destroy(pollable *p) {
+ po_destroy(&p->po);
+ if (p->epfd != -1) {
+ close(p->epfd);
+ grpc_wakeup_fd_destroy(&p->wakeup);
+ }
+}
+
+/* ensure that p->epfd, p->wakeup are initialized; p->po.mu must be held */
+static grpc_error *pollable_materialize(pollable *p) {
+ if (p->epfd == -1) {
+ int new_epfd = epoll_create1(EPOLL_CLOEXEC);
+ if (new_epfd < 0) {
+ return GRPC_OS_ERROR(errno, "epoll_create1");
+ }
+ grpc_error *err = grpc_wakeup_fd_init(&p->wakeup);
+ if (err != GRPC_ERROR_NONE) {
+ close(new_epfd);
+ return err;
+ }
+ struct epoll_event ev = {.events = (uint32_t)(EPOLLIN | EPOLLET),
+ .data.ptr = &p->wakeup};
+ if (epoll_ctl(new_epfd, EPOLL_CTL_ADD, p->wakeup.read_fd, &ev) != 0) {
+ err = GRPC_OS_ERROR(errno, "epoll_ctl");
+ close(new_epfd);
+ grpc_wakeup_fd_destroy(&p->wakeup);
+ return err;
+ }
+
+ p->epfd = new_epfd;
+ }
+ return GRPC_ERROR_NONE;
+}
+
+/* pollable must be materialized */
+static grpc_error *pollable_add_fd(pollable *p, grpc_fd *fd) {
+ grpc_error *error = GRPC_ERROR_NONE;
+ static const char *err_desc = "pollable_add_fd";
+ const int epfd = p->epfd;
+ GPR_ASSERT(epfd != -1);
+
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "add fd %p to pollable %p", fd, p);
+ }
+
+ gpr_mu_lock(&fd->orphaned_mu);
+ if (fd->orphaned) {
+ gpr_mu_unlock(&fd->orphaned_mu);
+ return GRPC_ERROR_NONE;
+ }
+ struct epoll_event ev_fd = {
+ .events = (uint32_t)(EPOLLET | EPOLLIN | EPOLLOUT | EPOLLEXCLUSIVE),
+ .data.ptr = fd};
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd->fd, &ev_fd) != 0) {
+ switch (errno) {
+ case EEXIST: /* if this fd is already in the epoll set, the workqueue fd
+ must also be - just return */
+ gpr_mu_unlock(&fd->orphaned_mu);
+ return GRPC_ERROR_NONE;
+ default:
+ append_error(&error, GRPC_OS_ERROR(errno, "epoll_ctl"), err_desc);
+ }
+ }
+ struct epoll_event ev_wq = {
+ .events = (uint32_t)(EPOLLET | EPOLLIN | EPOLLEXCLUSIVE),
+ .data.ptr = (void *)(1 + (intptr_t)fd)};
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd->workqueue_wakeup_fd.read_fd, &ev_wq) !=
+ 0) {
+ switch (errno) {
+ case EEXIST: /* if the workqueue fd is already in the epoll set we're ok
+ - no need to do anything special */
+ break;
+ default:
+ append_error(&error, GRPC_OS_ERROR(errno, "epoll_ctl"), err_desc);
+ }
+ }
+ gpr_mu_unlock(&fd->orphaned_mu);
+
+ return error;
+}
+
+/*******************************************************************************
+ * Pollset Definitions
+ */
+
+GPR_TLS_DECL(g_current_thread_pollset);
+GPR_TLS_DECL(g_current_thread_worker);
+
+/* Global state management */
+static grpc_error *pollset_global_init(void) {
+ gpr_tls_init(&g_current_thread_pollset);
+ gpr_tls_init(&g_current_thread_worker);
+ pollable_init(&g_empty_pollable, PO_EMPTY_POLLABLE);
+ return GRPC_ERROR_NONE;
+}
+
+static void pollset_global_shutdown(void) {
+ pollable_destroy(&g_empty_pollable);
+ gpr_tls_destroy(&g_current_thread_pollset);
+ gpr_tls_destroy(&g_current_thread_worker);
+}
+
+static grpc_error *pollset_kick_all(grpc_pollset *pollset) {
+ grpc_error *error = GRPC_ERROR_NONE;
+ if (pollset->root_worker != NULL) {
+ grpc_pollset_worker *worker = pollset->root_worker;
+ do {
+ if (worker->pollable != &pollset->pollable) {
+ gpr_mu_lock(&worker->pollable->po.mu);
+ }
+ if (worker->initialized_cv) {
+ worker->kicked = true;
+ gpr_cv_signal(&worker->cv);
+ } else {
+ append_error(&error, grpc_wakeup_fd_wakeup(&worker->pollable->wakeup),
+ "pollset_shutdown");
+ }
+ if (worker->pollable != &pollset->pollable) {
+ gpr_mu_unlock(&worker->pollable->po.mu);
+ }
+
+ worker = worker->links[PWL_POLLSET].next;
+ } while (worker != pollset->root_worker);
+ }
+ return error;
+}
+
+static grpc_error *pollset_kick_inner(grpc_pollset *pollset, pollable *p,
+ grpc_pollset_worker *specific_worker) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG,
+ "PS:%p kick %p tls_pollset=%p tls_worker=%p "
+ "root_worker=(pollset:%p pollable:%p)",
+ p, specific_worker, (void *)gpr_tls_get(&g_current_thread_pollset),
+ (void *)gpr_tls_get(&g_current_thread_worker), pollset->root_worker,
+ p->root_worker);
+ }
+ if (specific_worker == NULL) {
+ if (gpr_tls_get(&g_current_thread_pollset) != (intptr_t)pollset) {
+ if (pollset->root_worker == NULL) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p kicked_any_without_poller", p);
+ }
+ pollset->kicked_without_poller = true;
+ return GRPC_ERROR_NONE;
+ } else {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p kicked_any_via_wakeup_fd", p);
+ }
+ grpc_error *err = pollable_materialize(p);
+ if (err != GRPC_ERROR_NONE) return err;
+ return grpc_wakeup_fd_wakeup(&p->wakeup);
+ }
+ } else {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p kicked_any_but_awake", p);
+ }
+ return GRPC_ERROR_NONE;
+ }
+ } else if (specific_worker->kicked) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p kicked_specific_but_already_kicked", p);
+ }
+ return GRPC_ERROR_NONE;
+ } else if (gpr_tls_get(&g_current_thread_worker) ==
+ (intptr_t)specific_worker) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p kicked_specific_but_awake", p);
+ }
+ specific_worker->kicked = true;
+ return GRPC_ERROR_NONE;
+ } else if (specific_worker == p->root_worker) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p kicked_specific_via_wakeup_fd", p);
+ }
+ grpc_error *err = pollable_materialize(p);
+ if (err != GRPC_ERROR_NONE) return err;
+ specific_worker->kicked = true;
+ return grpc_wakeup_fd_wakeup(&p->wakeup);
+ } else {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p kicked_specific_via_cv", p);
+ }
+ specific_worker->kicked = true;
+ gpr_cv_signal(&specific_worker->cv);
+ return GRPC_ERROR_NONE;
+ }
+}
+
+/* p->po.mu must be held before calling this function */
+static grpc_error *pollset_kick(grpc_pollset *pollset,
+ grpc_pollset_worker *specific_worker) {
+ pollable *p = pollset->current_pollable;
+ if (p != &pollset->pollable) {
+ gpr_mu_lock(&p->po.mu);
+ }
+ grpc_error *error = pollset_kick_inner(pollset, p, specific_worker);
+ if (p != &pollset->pollable) {
+ gpr_mu_unlock(&p->po.mu);
+ }
+ return error;
+}
+
+static void pollset_init(grpc_pollset *pollset, gpr_mu **mu) {
+ pollable_init(&pollset->pollable, PO_POLLSET);
+ pollset->current_pollable = &g_empty_pollable;
+ pollset->kicked_without_poller = false;
+ pollset->shutdown_closure = NULL;
+ pollset->root_worker = NULL;
+ *mu = &pollset->pollable.po.mu;
+}
+
+/* Convert a timespec to milliseconds:
+ - Very small or negative poll times are clamped to zero to do a non-blocking
+ poll (which becomes spin polling)
+ - Other small values are rounded up to one millisecond
+ - Longer than a millisecond polls are rounded up to the next nearest
+ millisecond to avoid spinning
+ - Infinite timeouts are converted to -1 */
+static int poll_deadline_to_millis_timeout(gpr_timespec deadline,
+ gpr_timespec now) {
+ gpr_timespec timeout;
+ if (gpr_time_cmp(deadline, gpr_inf_future(deadline.clock_type)) == 0) {
+ return -1;
+ }
+
+ if (gpr_time_cmp(deadline, now) <= 0) {
+ return 0;
+ }
+
+ static const gpr_timespec round_up = {
+ .clock_type = GPR_TIMESPAN, .tv_sec = 0, .tv_nsec = GPR_NS_PER_MS - 1};
+ timeout = gpr_time_sub(deadline, now);
+ int millis = gpr_time_to_millis(gpr_time_add(timeout, round_up));
+ return millis >= 1 ? millis : 1;
+}
+
+static void fd_become_readable(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
+ grpc_pollset *notifier) {
+ grpc_lfev_set_ready(exec_ctx, &fd->read_closure);
+
+ /* Note, it is possible that fd_become_readable might be called twice with
+ different 'notifier's when an fd becomes readable and it is in two epoll
+ sets (This can happen briefly during polling island merges). In such cases
+ it does not really matter which notifer is set as the read_notifier_pollset
+ (They would both point to the same polling island anyway) */
+ /* Use release store to match with acquire load in fd_get_read_notifier */
+ gpr_atm_rel_store(&fd->read_notifier_pollset, (gpr_atm)notifier);
+}
+
+static void fd_become_writable(grpc_exec_ctx *exec_ctx, grpc_fd *fd) {
+ grpc_lfev_set_ready(exec_ctx, &fd->write_closure);
+}
+
+static grpc_error *fd_become_pollable_locked(grpc_fd *fd) {
+ grpc_error *error = GRPC_ERROR_NONE;
+ static const char *err_desc = "fd_become_pollable";
+ if (append_error(&error, pollable_materialize(&fd->pollable), err_desc)) {
+ append_error(&error, pollable_add_fd(&fd->pollable, fd), err_desc);
+ }
+ return error;
+}
+
+static void pollset_maybe_finish_shutdown(grpc_exec_ctx *exec_ctx,
+ grpc_pollset *pollset) {
+ if (pollset->shutdown_closure != NULL && pollset->root_worker == NULL) {
+ grpc_closure_sched(exec_ctx, pollset->shutdown_closure, GRPC_ERROR_NONE);
+ pollset->shutdown_closure = NULL;
+ }
+}
+
+/* pollset->po.mu lock must be held by the caller before calling this */
+static void pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_closure *closure) {
+ GPR_ASSERT(pollset->shutdown_closure == NULL);
+ pollset->shutdown_closure = closure;
+ GRPC_LOG_IF_ERROR("pollset_shutdown", pollset_kick_all(pollset));
+ pollset_maybe_finish_shutdown(exec_ctx, pollset);
+}
+
+static bool pollset_is_pollable_fd(grpc_pollset *pollset, pollable *p) {
+ return p != &g_empty_pollable && p != &pollset->pollable;
+}
+
+/* pollset_shutdown is guaranteed to be called before pollset_destroy. */
+static void pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
+ pollable_destroy(&pollset->pollable);
+ if (pollset_is_pollable_fd(pollset, pollset->current_pollable)) {
+ UNREF_BY(exec_ctx, (grpc_fd *)pollset->current_pollable, 2,
+ "pollset_pollable");
+ }
+}
+
+#define MAX_EPOLL_EVENTS 100
+
+static grpc_error *pollset_epoll(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ pollable *p, gpr_timespec now,
+ gpr_timespec deadline) {
+ struct epoll_event events[MAX_EPOLL_EVENTS];
+ static const char *err_desc = "pollset_poll";
+
+ int timeout = poll_deadline_to_millis_timeout(deadline, now);
+
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p poll %p for %dms", pollset, p, timeout);
+ }
+
+ if (timeout != 0) {
+ GRPC_SCHEDULING_START_BLOCKING_REGION;
+ }
+ int r;
+ do {
+ r = epoll_wait(p->epfd, events, MAX_EPOLL_EVENTS, timeout);
+ } while (r < 0 && errno == EINTR);
+ if (timeout != 0) {
+ GRPC_SCHEDULING_END_BLOCKING_REGION;
+ }
+
+ if (r < 0) return GRPC_OS_ERROR(errno, "epoll_wait");
+
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p poll %p got %d events", pollset, p, r);
+ }
+
+ grpc_error *error = GRPC_ERROR_NONE;
+ for (int i = 0; i < r; i++) {
+ void *data_ptr = events[i].data.ptr;
+ if (data_ptr == &p->wakeup) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p poll %p got pollset_wakeup", pollset, p);
+ }
+ append_error(&error, grpc_wakeup_fd_consume_wakeup(&p->wakeup), err_desc);
+ } else {
+ grpc_fd *fd = (grpc_fd *)(((intptr_t)data_ptr) & ~(intptr_t)1);
+ bool is_workqueue = (((intptr_t)data_ptr) & 1) != 0;
+ bool cancel = (events[i].events & (EPOLLERR | EPOLLHUP)) != 0;
+ bool read_ev = (events[i].events & (EPOLLIN | EPOLLPRI)) != 0;
+ bool write_ev = (events[i].events & EPOLLOUT) != 0;
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG,
+ "PS:%p poll %p got fd %p: is_wq=%d cancel=%d read=%d "
+ "write=%d",
+ pollset, p, fd, is_workqueue, cancel, read_ev, write_ev);
+ }
+ if (is_workqueue) {
+ append_error(&error,
+ grpc_wakeup_fd_consume_wakeup(&fd->workqueue_wakeup_fd),
+ err_desc);
+ fd_invoke_workqueue(exec_ctx, fd);
+ } else {
+ if (read_ev || cancel) {
+ fd_become_readable(exec_ctx, fd, pollset);
+ }
+ if (write_ev || cancel) {
+ fd_become_writable(exec_ctx, fd);
+ }
+ }
+ }
+ }
+
+ return error;
+}
+
+/* Return true if first in list */
+static bool worker_insert(grpc_pollset_worker **root, pollset_worker_links link,
+ grpc_pollset_worker *worker) {
+ if (*root == NULL) {
+ *root = worker;
+ worker->links[link].next = worker->links[link].prev = worker;
+ return true;
+ } else {
+ worker->links[link].next = *root;
+ worker->links[link].prev = worker->links[link].next->links[link].prev;
+ worker->links[link].next->links[link].prev = worker;
+ worker->links[link].prev->links[link].next = worker;
+ return false;
+ }
+}
+
+/* Return true if last in list */
+typedef enum { EMPTIED, NEW_ROOT, REMOVED } worker_remove_result;
+
+static worker_remove_result worker_remove(grpc_pollset_worker **root,
+ pollset_worker_links link,
+ grpc_pollset_worker *worker) {
+ if (worker == *root) {
+ if (worker == worker->links[link].next) {
+ *root = NULL;
+ return EMPTIED;
+ } else {
+ *root = worker->links[link].next;
+ worker->links[link].prev->links[link].next = worker->links[link].next;
+ worker->links[link].next->links[link].prev = worker->links[link].prev;
+ return NEW_ROOT;
+ }
+ } else {
+ worker->links[link].prev->links[link].next = worker->links[link].next;
+ worker->links[link].next->links[link].prev = worker->links[link].prev;
+ return REMOVED;
+ }
+}
+
+/* Return true if this thread should poll */
+static bool begin_worker(grpc_pollset *pollset, grpc_pollset_worker *worker,
+ grpc_pollset_worker **worker_hdl, gpr_timespec *now,
+ gpr_timespec deadline) {
+ bool do_poll = true;
+ if (worker_hdl != NULL) *worker_hdl = worker;
+ worker->initialized_cv = false;
+ worker->kicked = false;
+ worker->pollset = pollset;
+ worker->pollable = pollset->current_pollable;
+
+ if (pollset_is_pollable_fd(pollset, worker->pollable)) {
+ REF_BY((grpc_fd *)worker->pollable, 2, "one_poll");
+ }
+
+ worker_insert(&pollset->root_worker, PWL_POLLSET, worker);
+ if (!worker_insert(&worker->pollable->root_worker, PWL_POLLABLE, worker)) {
+ worker->initialized_cv = true;
+ gpr_cv_init(&worker->cv);
+ if (worker->pollable != &pollset->pollable) {
+ gpr_mu_unlock(&pollset->pollable.po.mu);
+ }
+ if (GRPC_TRACER_ON(grpc_polling_trace) &&
+ worker->pollable->root_worker != worker) {
+ gpr_log(GPR_DEBUG, "PS:%p wait %p w=%p for %dms", pollset,
+ worker->pollable, worker,
+ poll_deadline_to_millis_timeout(deadline, *now));
+ }
+ while (do_poll && worker->pollable->root_worker != worker) {
+ if (gpr_cv_wait(&worker->cv, &worker->pollable->po.mu, deadline)) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p timeout_wait %p w=%p", pollset,
+ worker->pollable, worker);
+ }
+ do_poll = false;
+ } else if (worker->kicked) {
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p wakeup %p w=%p", pollset, worker->pollable,
+ worker);
+ }
+ do_poll = false;
+ } else if (GRPC_TRACER_ON(grpc_polling_trace) &&
+ worker->pollable->root_worker != worker) {
+ gpr_log(GPR_DEBUG, "PS:%p spurious_wakeup %p w=%p", pollset,
+ worker->pollable, worker);
+ }
+ }
+ if (worker->pollable != &pollset->pollable) {
+ gpr_mu_unlock(&worker->pollable->po.mu);
+ gpr_mu_lock(&pollset->pollable.po.mu);
+ gpr_mu_lock(&worker->pollable->po.mu);
+ }
+ *now = gpr_now(now->clock_type);
+ }
+
+ return do_poll && pollset->shutdown_closure == NULL &&
+ pollset->current_pollable == worker->pollable;
+}
+
+static void end_worker(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_pollset_worker *worker,
+ grpc_pollset_worker **worker_hdl) {
+ if (NEW_ROOT ==
+ worker_remove(&worker->pollable->root_worker, PWL_POLLABLE, worker)) {
+ gpr_cv_signal(&worker->pollable->root_worker->cv);
+ }
+ if (worker->initialized_cv) {
+ gpr_cv_destroy(&worker->cv);
+ }
+ if (pollset_is_pollable_fd(pollset, worker->pollable)) {
+ UNREF_BY(exec_ctx, (grpc_fd *)worker->pollable, 2, "one_poll");
+ }
+ if (EMPTIED == worker_remove(&pollset->root_worker, PWL_POLLSET, worker)) {
+ pollset_maybe_finish_shutdown(exec_ctx, pollset);
+ }
+}
+
+/* pollset->po.mu lock must be held by the caller before calling this.
+ The function pollset_work() may temporarily release the lock (pollset->po.mu)
+ during the course of its execution but it will always re-acquire the lock and
+ ensure that it is held by the time the function returns */
+static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_pollset_worker **worker_hdl,
+ gpr_timespec now, gpr_timespec deadline) {
+ grpc_pollset_worker worker;
+ if (0 && GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "PS:%p work hdl=%p worker=%p now=%" PRId64
+ ".%09d deadline=%" PRId64 ".%09d kwp=%d root_worker=%p",
+ pollset, worker_hdl, &worker, now.tv_sec, now.tv_nsec,
+ deadline.tv_sec, deadline.tv_nsec, pollset->kicked_without_poller,
+ pollset->root_worker);
+ }
+ grpc_error *error = GRPC_ERROR_NONE;
+ static const char *err_desc = "pollset_work";
+ if (pollset->kicked_without_poller) {
+ pollset->kicked_without_poller = false;
+ return GRPC_ERROR_NONE;
+ }
+ if (pollset->current_pollable != &pollset->pollable) {
+ gpr_mu_lock(&pollset->current_pollable->po.mu);
+ }
+ if (begin_worker(pollset, &worker, worker_hdl, &now, deadline)) {
+ gpr_tls_set(&g_current_thread_pollset, (intptr_t)pollset);
+ gpr_tls_set(&g_current_thread_worker, (intptr_t)&worker);
+ GPR_ASSERT(!pollset->shutdown_closure);
+ append_error(&error, pollable_materialize(worker.pollable), err_desc);
+ if (worker.pollable != &pollset->pollable) {
+ gpr_mu_unlock(&worker.pollable->po.mu);
+ }
+ gpr_mu_unlock(&pollset->pollable.po.mu);
+ append_error(&error, pollset_epoll(exec_ctx, pollset, worker.pollable, now,
+ deadline),
+ err_desc);
+ grpc_exec_ctx_flush(exec_ctx);
+ gpr_mu_lock(&pollset->pollable.po.mu);
+ if (worker.pollable != &pollset->pollable) {
+ gpr_mu_lock(&worker.pollable->po.mu);
+ }
+ gpr_tls_set(&g_current_thread_pollset, 0);
+ gpr_tls_set(&g_current_thread_worker, 0);
+ pollset_maybe_finish_shutdown(exec_ctx, pollset);
+ }
+ end_worker(exec_ctx, pollset, &worker, worker_hdl);
+ if (worker.pollable != &pollset->pollable) {
+ gpr_mu_unlock(&worker.pollable->po.mu);
+ }
+ return error;
+}
+
+static void unref_fd_no_longer_poller(grpc_exec_ctx *exec_ctx, void *arg,
+ grpc_error *error) {
+ grpc_fd *fd = arg;
+ UNREF_BY(exec_ctx, fd, 2, "pollset_pollable");
+}
+
+/* expects pollsets locked, flag whether fd is locked or not */
+static grpc_error *pollset_add_fd_locked(grpc_exec_ctx *exec_ctx,
+ grpc_pollset *pollset, grpc_fd *fd,
+ bool fd_locked) {
+ static const char *err_desc = "pollset_add_fd";
+ grpc_error *error = GRPC_ERROR_NONE;
+ if (pollset->current_pollable == &g_empty_pollable) {
+ if (GRPC_TRACER_ON(grpc_polling_trace))
+ gpr_log(GPR_DEBUG,
+ "PS:%p add fd %p; transition pollable from empty to fd", pollset,
+ fd);
+ /* empty pollable --> single fd pollable */
+ append_error(&error, pollset_kick_all(pollset), err_desc);
+ pollset->current_pollable = &fd->pollable;
+ if (!fd_locked) gpr_mu_lock(&fd->pollable.po.mu);
+ append_error(&error, fd_become_pollable_locked(fd), err_desc);
+ if (!fd_locked) gpr_mu_unlock(&fd->pollable.po.mu);
+ REF_BY(fd, 2, "pollset_pollable");
+ } else if (pollset->current_pollable == &pollset->pollable) {
+ if (GRPC_TRACER_ON(grpc_polling_trace))
+ gpr_log(GPR_DEBUG, "PS:%p add fd %p; already multipolling", pollset, fd);
+ append_error(&error, pollable_add_fd(pollset->current_pollable, fd),
+ err_desc);
+ } else if (pollset->current_pollable != &fd->pollable) {
+ grpc_fd *had_fd = (grpc_fd *)pollset->current_pollable;
+ if (GRPC_TRACER_ON(grpc_polling_trace))
+ gpr_log(GPR_DEBUG,
+ "PS:%p add fd %p; transition pollable from fd %p to multipoller",
+ pollset, fd, had_fd);
+ append_error(&error, pollset_kick_all(pollset), err_desc);
+ pollset->current_pollable = &pollset->pollable;
+ if (append_error(&error, pollable_materialize(&pollset->pollable),
+ err_desc)) {
+ pollable_add_fd(&pollset->pollable, had_fd);
+ pollable_add_fd(&pollset->pollable, fd);
+ }
+ grpc_closure_sched(exec_ctx,
+ grpc_closure_create(unref_fd_no_longer_poller, had_fd,
+ grpc_schedule_on_exec_ctx),
+ GRPC_ERROR_NONE);
+ }
+ return error;
+}
+
+static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+ grpc_fd *fd) {
+ gpr_mu_lock(&pollset->pollable.po.mu);
+ grpc_error *error = pollset_add_fd_locked(exec_ctx, pollset, fd, false);
+ gpr_mu_unlock(&pollset->pollable.po.mu);
+ GRPC_LOG_IF_ERROR("pollset_add_fd", error);
+}
+
+/*******************************************************************************
+ * Pollset-set Definitions
+ */
+
+static grpc_pollset_set *pollset_set_create(void) {
+ grpc_pollset_set *pss = gpr_zalloc(sizeof(*pss));
+ po_init(&pss->po, PO_POLLSET_SET);
+ return pss;
+}
+
+static void pollset_set_destroy(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss) {
+ po_destroy(&pss->po);
+ gpr_free(pss);
+}
+
+static void pollset_set_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {
+ po_join(exec_ctx, &pss->po, &fd->pollable.po);
+}
+
+static void pollset_set_del_fd(grpc_exec_ctx *exec_ctx, grpc_pollset_set *pss,
+ grpc_fd *fd) {}
+
+static void pollset_set_add_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {
+ po_join(exec_ctx, &pss->po, &ps->pollable.po);
+}
+
+static void pollset_set_del_pollset(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *pss, grpc_pollset *ps) {}
+
+static void pollset_set_add_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {
+ po_join(exec_ctx, &bag->po, &item->po);
+}
+
+static void pollset_set_del_pollset_set(grpc_exec_ctx *exec_ctx,
+ grpc_pollset_set *bag,
+ grpc_pollset_set *item) {}
+
+static void po_init(polling_obj *po, polling_obj_type type) {
+ gpr_mu_init(&po->mu);
+ po->type = type;
+ po->group = NULL;
+ po->next = po;
+ po->prev = po;
+}
+
+static polling_group *pg_lock_latest(polling_group *pg) {
+ /* assumes pg unlocked; consumes ref, returns ref */
+ gpr_mu_lock(&pg->po.mu);
+ while (pg->po.group != NULL) {
+ polling_group *new_pg = pg_ref(pg->po.group);
+ gpr_mu_unlock(&pg->po.mu);
+ pg_unref(pg);
+ pg = new_pg;
+ gpr_mu_lock(&pg->po.mu);
+ }
+ return pg;
+}
+
+static void po_destroy(polling_obj *po) {
+ if (po->group != NULL) {
+ polling_group *pg = pg_lock_latest(po->group);
+ po->prev->next = po->next;
+ po->next->prev = po->prev;
+ gpr_mu_unlock(&pg->po.mu);
+ pg_unref(pg);
+ }
+ gpr_mu_destroy(&po->mu);
+}
+
+static polling_group *pg_ref(polling_group *pg) {
+ gpr_ref(&pg->refs);
+ return pg;
+}
+
+static void pg_unref(polling_group *pg) {
+ if (gpr_unref(&pg->refs)) {
+ po_destroy(&pg->po);
+ gpr_free(pg);
+ }
+}
+
+static int po_cmp(polling_obj *a, polling_obj *b) {
+ if (a == b) return 0;
+ if (a->type < b->type) return -1;
+ if (a->type > b->type) return 1;
+ if (a < b) return -1;
+ assert(a > b);
+ return 1;
+}
+
+static void po_join(grpc_exec_ctx *exec_ctx, polling_obj *a, polling_obj *b) {
+ switch (po_cmp(a, b)) {
+ case 0:
+ return;
+ case 1:
+ GPR_SWAP(polling_obj *, a, b);
+ /* fall through */
+ case -1:
+ gpr_mu_lock(&a->mu);
+ gpr_mu_lock(&b->mu);
+
+ if (a->group == NULL) {
+ if (b->group == NULL) {
+ polling_obj *initial_po[] = {a, b};
+ pg_create(exec_ctx, initial_po, GPR_ARRAY_SIZE(initial_po));
+ gpr_mu_unlock(&a->mu);
+ gpr_mu_unlock(&b->mu);
+ } else {
+ polling_group *b_group = pg_ref(b->group);
+ gpr_mu_unlock(&b->mu);
+ gpr_mu_unlock(&a->mu);
+ pg_join(exec_ctx, b_group, a);
+ }
+ } else if (b->group == NULL) {
+ polling_group *a_group = pg_ref(a->group);
+ gpr_mu_unlock(&a->mu);
+ gpr_mu_unlock(&b->mu);
+ pg_join(exec_ctx, a_group, b);
+ } else if (a->group == b->group) {
+ /* nothing to do */
+ gpr_mu_unlock(&a->mu);
+ gpr_mu_unlock(&b->mu);
+ } else {
+ polling_group *a_group = pg_ref(a->group);
+ polling_group *b_group = pg_ref(b->group);
+ gpr_mu_unlock(&a->mu);
+ gpr_mu_unlock(&b->mu);
+ pg_merge(exec_ctx, a_group, b_group);
+ }
+ }
+}
+
+static void pg_notify(grpc_exec_ctx *exec_ctx, polling_obj *a, polling_obj *b) {
+ if (a->type == PO_FD && b->type == PO_POLLSET) {
+ pollset_add_fd_locked(exec_ctx, (grpc_pollset *)b, (grpc_fd *)a, true);
+ } else if (a->type == PO_POLLSET && b->type == PO_FD) {
+ pollset_add_fd_locked(exec_ctx, (grpc_pollset *)a, (grpc_fd *)b, true);
+ }
+}
+
+static void pg_broadcast(grpc_exec_ctx *exec_ctx, polling_group *from,
+ polling_group *to) {
+ for (polling_obj *a = from->po.next; a != &from->po; a = a->next) {
+ for (polling_obj *b = to->po.next; b != &to->po; b = b->next) {
+ if (po_cmp(a, b) < 0) {
+ gpr_mu_lock(&a->mu);
+ gpr_mu_lock(&b->mu);
+ } else {
+ GPR_ASSERT(po_cmp(a, b) != 0);
+ gpr_mu_lock(&b->mu);
+ gpr_mu_lock(&a->mu);
+ }
+ pg_notify(exec_ctx, a, b);
+ gpr_mu_unlock(&a->mu);
+ gpr_mu_unlock(&b->mu);
+ }
+ }
+}
+
+static void pg_create(grpc_exec_ctx *exec_ctx, polling_obj **initial_po,
+ size_t initial_po_count) {
+ /* assumes all polling objects in initial_po are locked */
+ polling_group *pg = gpr_malloc(sizeof(*pg));
+ po_init(&pg->po, PO_POLLING_GROUP);
+ gpr_ref_init(&pg->refs, (int)initial_po_count);
+ for (size_t i = 0; i < initial_po_count; i++) {
+ GPR_ASSERT(initial_po[i]->group == NULL);
+ initial_po[i]->group = pg;
+ }
+ for (size_t i = 1; i < initial_po_count; i++) {
+ initial_po[i]->prev = initial_po[i - 1];
+ }
+ for (size_t i = 0; i < initial_po_count - 1; i++) {
+ initial_po[i]->next = initial_po[i + 1];
+ }
+ initial_po[0]->prev = &pg->po;
+ initial_po[initial_po_count - 1]->next = &pg->po;
+ pg->po.next = initial_po[0];
+ pg->po.prev = initial_po[initial_po_count - 1];
+ for (size_t i = 1; i < initial_po_count; i++) {
+ for (size_t j = 0; j < i; j++) {
+ pg_notify(exec_ctx, initial_po[i], initial_po[j]);
+ }
+ }
+}
+
+static void pg_join(grpc_exec_ctx *exec_ctx, polling_group *pg,
+ polling_obj *po) {
+ /* assumes neither pg nor po are locked; consumes one ref to pg */
+ pg = pg_lock_latest(pg);
+ /* pg locked */
+ for (polling_obj *existing = pg->po.next /* skip pg - it's just a stub */;
+ existing != &pg->po; existing = existing->next) {
+ if (po_cmp(po, existing) < 0) {
+ gpr_mu_lock(&po->mu);
+ gpr_mu_lock(&existing->mu);
+ } else {
+ GPR_ASSERT(po_cmp(po, existing) != 0);
+ gpr_mu_lock(&existing->mu);
+ gpr_mu_lock(&po->mu);
+ }
+ /* pg, po, existing locked */
+ if (po->group != NULL) {
+ gpr_mu_unlock(&pg->po.mu);
+ polling_group *po_group = pg_ref(po->group);
+ gpr_mu_unlock(&po->mu);
+ gpr_mu_unlock(&existing->mu);
+ pg_merge(exec_ctx, pg, po_group);
+ /* early exit: polling obj picked up a group during joining: we needed
+ to do a full merge */
+ return;
+ }
+ pg_notify(exec_ctx, po, existing);
+ gpr_mu_unlock(&po->mu);
+ gpr_mu_unlock(&existing->mu);
+ }
+ gpr_mu_lock(&po->mu);
+ if (po->group != NULL) {
+ gpr_mu_unlock(&pg->po.mu);
+ polling_group *po_group = pg_ref(po->group);
+ gpr_mu_unlock(&po->mu);
+ pg_merge(exec_ctx, pg, po_group);
+ /* early exit: polling obj picked up a group during joining: we needed
+ to do a full merge */
+ return;
+ }
+ po->group = pg;
+ po->next = &pg->po;
+ po->prev = pg->po.prev;
+ po->prev->next = po->next->prev = po;
+ gpr_mu_unlock(&pg->po.mu);
+ gpr_mu_unlock(&po->mu);
+}
+
+static void pg_merge(grpc_exec_ctx *exec_ctx, polling_group *a,
+ polling_group *b) {
+ for (;;) {
+ if (a == b) {
+ pg_unref(a);
+ pg_unref(b);
+ return;
+ }
+ if (a > b) GPR_SWAP(polling_group *, a, b);
+ gpr_mu_lock(&a->po.mu);
+ gpr_mu_lock(&b->po.mu);
+ if (a->po.group != NULL) {
+ polling_group *m2 = pg_ref(a->po.group);
+ gpr_mu_unlock(&a->po.mu);
+ gpr_mu_unlock(&b->po.mu);
+ pg_unref(a);
+ a = m2;
+ } else if (b->po.group != NULL) {
+ polling_group *m2 = pg_ref(b->po.group);
+ gpr_mu_unlock(&a->po.mu);
+ gpr_mu_unlock(&b->po.mu);
+ pg_unref(b);
+ b = m2;
+ } else {
+ break;
+ }
+ }
+ polling_group **unref = NULL;
+ size_t unref_count = 0;
+ size_t unref_cap = 0;
+ b->po.group = a;
+ pg_broadcast(exec_ctx, a, b);
+ pg_broadcast(exec_ctx, b, a);
+ while (b->po.next != &b->po) {
+ polling_obj *po = b->po.next;
+ gpr_mu_lock(&po->mu);
+ if (unref_count == unref_cap) {
+ unref_cap = GPR_MAX(8, 3 * unref_cap / 2);
+ unref = gpr_realloc(unref, unref_cap * sizeof(*unref));
+ }
+ unref[unref_count++] = po->group;
+ po->group = pg_ref(a);
+ // unlink from b
+ po->prev->next = po->next;
+ po->next->prev = po->prev;
+ // link to a
+ po->next = &a->po;
+ po->prev = a->po.prev;
+ po->next->prev = po->prev->next = po;
+ gpr_mu_unlock(&po->mu);
+ }
+ gpr_mu_unlock(&a->po.mu);
+ gpr_mu_unlock(&b->po.mu);
+ for (size_t i = 0; i < unref_count; i++) {
+ pg_unref(unref[i]);
+ }
+ gpr_free(unref);
+ pg_unref(b);
+}
+
+/*******************************************************************************
+ * Event engine binding
+ */
+
+static void shutdown_engine(void) {
+ fd_global_shutdown();
+ pollset_global_shutdown();
+}
+
+static const grpc_event_engine_vtable vtable = {
+ .pollset_size = sizeof(grpc_pollset),
+
+ .fd_create = fd_create,
+ .fd_wrapped_fd = fd_wrapped_fd,
+ .fd_orphan = fd_orphan,
+ .fd_shutdown = fd_shutdown,
+ .fd_is_shutdown = fd_is_shutdown,
+ .fd_notify_on_read = fd_notify_on_read,
+ .fd_notify_on_write = fd_notify_on_write,
+ .fd_get_read_notifier_pollset = fd_get_read_notifier_pollset,
+ .fd_get_workqueue = fd_get_workqueue,
+
+ .pollset_init = pollset_init,
+ .pollset_shutdown = pollset_shutdown,
+ .pollset_destroy = pollset_destroy,
+ .pollset_work = pollset_work,
+ .pollset_kick = pollset_kick,
+ .pollset_add_fd = pollset_add_fd,
+
+ .pollset_set_create = pollset_set_create,
+ .pollset_set_destroy = pollset_set_destroy,
+ .pollset_set_add_pollset = pollset_set_add_pollset,
+ .pollset_set_del_pollset = pollset_set_del_pollset,
+ .pollset_set_add_pollset_set = pollset_set_add_pollset_set,
+ .pollset_set_del_pollset_set = pollset_set_del_pollset_set,
+ .pollset_set_add_fd = pollset_set_add_fd,
+ .pollset_set_del_fd = pollset_set_del_fd,
+
+ .workqueue_ref = workqueue_ref,
+ .workqueue_unref = workqueue_unref,
+ .workqueue_scheduler = workqueue_scheduler,
+
+ .shutdown_engine = shutdown_engine,
+};
+
+const grpc_event_engine_vtable *grpc_init_epollex_linux(
+ bool explicitly_requested) {
+ if (!explicitly_requested) return NULL;
+
+ if (!grpc_has_wakeup_fd()) {
+ return NULL;
+ }
+
+ if (!grpc_is_epollexclusive_available()) {
+ return NULL;
+ }
+
+ fd_global_init();
+
+ if (!GRPC_LOG_IF_ERROR("pollset_global_init", pollset_global_init())) {
+ pollset_global_shutdown();
+ fd_global_shutdown();
+ return NULL;
+ }
+
+ return &vtable;
+}
+
+#else /* defined(GRPC_LINUX_EPOLL) */
+#if defined(GRPC_POSIX_SOCKET)
+#include "src/core/lib/iomgr/ev_posix.h"
+/* If GRPC_LINUX_EPOLL is not defined, it means epoll is not available. Return
+ * NULL */
+const grpc_event_engine_vtable *grpc_init_epollex_linux(
+ bool explicitly_requested) {
+ return NULL;
+}
+#endif /* defined(GRPC_POSIX_SOCKET) */
+
+#endif /* !defined(GRPC_LINUX_EPOLL) */
diff --git a/src/core/lib/iomgr/ev_epollex_linux.h b/src/core/lib/iomgr/ev_epollex_linux.h
new file mode 100644
index 0000000000..a078a7f19a
--- /dev/null
+++ b/src/core/lib/iomgr/ev_epollex_linux.h
@@ -0,0 +1,43 @@
+/*
+ *
+ * Copyright 2015, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_IOMGR_EV_EPOLLEX_LINUX_H
+#define GRPC_CORE_LIB_IOMGR_EV_EPOLLEX_LINUX_H
+
+#include "src/core/lib/iomgr/ev_posix.h"
+#include "src/core/lib/iomgr/port.h"
+
+const grpc_event_engine_vtable *grpc_init_epollex_linux(
+ bool explicitly_requested);
+
+#endif /* GRPC_CORE_LIB_IOMGR_EV_EPOLLEX_LINUX_H */
diff --git a/src/core/lib/iomgr/ev_epoll_linux.c b/src/core/lib/iomgr/ev_epollsig_linux.c
index e603a75593..92c555b7ea 100644
--- a/src/core/lib/iomgr/ev_epoll_linux.c
+++ b/src/core/lib/iomgr/ev_epollsig_linux.c
@@ -36,7 +36,7 @@
/* This polling engine is only relevant on linux kernels supporting epoll() */
#ifdef GRPC_LINUX_EPOLL
-#include "src/core/lib/iomgr/ev_epoll_linux.h"
+#include "src/core/lib/iomgr/ev_epollsig_linux.h"
#include <assert.h>
#include <errno.h>
@@ -63,11 +63,11 @@
#include "src/core/lib/profiling/timers.h"
#include "src/core/lib/support/block_annotate.h"
-/* TODO: sreek - Move this to init.c and initialize this like other tracers. */
-static int grpc_polling_trace = 0; /* Disabled by default */
-#define GRPC_POLLING_TRACE(fmt, ...) \
- if (grpc_polling_trace) { \
- gpr_log(GPR_INFO, (fmt), __VA_ARGS__); \
+#define GRPC_POLLSET_KICK_BROADCAST ((grpc_pollset_worker *)1)
+
+#define GRPC_POLLING_TRACE(...) \
+ if (GRPC_TRACER_ON(grpc_polling_trace)) { \
+ gpr_log(GPR_INFO, __VA_ARGS__); \
}
/* Uncomment the following to enable extra checks on poll_object operations */
@@ -76,11 +76,6 @@ static int grpc_polling_trace = 0; /* Disabled by default */
static int grpc_wakeup_signal = -1;
static bool is_grpc_wakeup_signal_initialized = false;
-/* TODO: sreek: Right now, this wakes up all pollers. In future we should make
- * sure to wake up one polling thread (which can wake up other threads if
- * needed) */
-static grpc_wakeup_fd global_wakeup_fd;
-
/* Implements the function defined in grpc_posix.h. This function might be
* called before even calling grpc_init() to set either a different signal to
* use. If signum == -1, then the use of signals is disabled */
@@ -454,8 +449,8 @@ static void polling_island_add_wakeup_fd_locked(polling_island *pi,
gpr_asprintf(&err_msg,
"epoll_ctl (epoll_fd: %d) add wakeup fd: %d failed with "
"error: %d (%s)",
- pi->epoll_fd, GRPC_WAKEUP_FD_GET_READ_FD(&global_wakeup_fd),
- errno, strerror(errno));
+ pi->epoll_fd, GRPC_WAKEUP_FD_GET_READ_FD(wakeup_fd), errno,
+ strerror(errno));
append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc);
gpr_free(err_msg);
}
@@ -558,7 +553,6 @@ static polling_island *polling_island_create(grpc_exec_ctx *exec_ctx,
goto done;
}
- polling_island_add_wakeup_fd_locked(pi, &global_wakeup_fd, error);
polling_island_add_wakeup_fd_locked(pi, &pi->workqueue_wakeup_fd, error);
if (initial_fd != NULL) {
@@ -738,7 +732,7 @@ static void workqueue_maybe_wakeup(polling_island *pi) {
it right now. Note that since we do an anticipatory mpscq_pop every poll
loop, it's ok if we miss the wakeup here, as we'll get the work item when
the next poller enters anyway. */
- if (current_pollers > min_current_pollers_for_wakeup) {
+ if (current_pollers >= min_current_pollers_for_wakeup) {
GRPC_LOG_IF_ERROR("workqueue_wakeup_fd",
grpc_wakeup_fd_wakeup(&pi->workqueue_wakeup_fd));
}
@@ -1116,11 +1110,10 @@ static grpc_error *pollset_global_init(void) {
gpr_tls_init(&g_current_thread_pollset);
gpr_tls_init(&g_current_thread_worker);
poller_kick_init();
- return grpc_wakeup_fd_init(&global_wakeup_fd);
+ return GRPC_ERROR_NONE;
}
static void pollset_global_shutdown(void) {
- grpc_wakeup_fd_destroy(&global_wakeup_fd);
gpr_tls_destroy(&g_current_thread_pollset);
gpr_tls_destroy(&g_current_thread_worker);
}
@@ -1226,10 +1219,6 @@ static grpc_error *pollset_kick(grpc_pollset *p,
return error;
}
-static grpc_error *kick_poller(void) {
- return grpc_wakeup_fd_wakeup(&global_wakeup_fd);
-}
-
static void pollset_init(grpc_pollset *pollset, gpr_mu **mu) {
gpr_mu_init(&pollset->po.mu);
*mu = &pollset->po.mu;
@@ -1332,7 +1321,7 @@ static void pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
/* pollset_shutdown is guaranteed to be called before pollset_destroy. So other
* than destroying the mutexes, there is nothing special that needs to be done
* here */
-static void pollset_destroy(grpc_pollset *pollset) {
+static void pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
GPR_ASSERT(!pollset_has_workers(pollset));
gpr_mu_destroy(&pollset->po.mu);
}
@@ -1343,7 +1332,13 @@ static bool maybe_do_workqueue_work(grpc_exec_ctx *exec_ctx,
gpr_mpscq_node *n = gpr_mpscq_pop(&pi->workqueue_items);
gpr_mu_unlock(&pi->workqueue_read_mu);
if (n != NULL) {
- if (gpr_atm_full_fetch_add(&pi->workqueue_item_count, -1) > 1) {
+ gpr_atm remaining =
+ gpr_atm_full_fetch_add(&pi->workqueue_item_count, -1) - 1;
+ GRPC_POLLING_TRACE(
+ "maybe_do_workqueue_work: pi: %p: got closure %p, remaining = "
+ "%" PRIdPTR,
+ pi, n, remaining);
+ if (remaining > 0) {
workqueue_maybe_wakeup(pi);
}
grpc_closure *c = (grpc_closure *)n;
@@ -1358,8 +1353,13 @@ static bool maybe_do_workqueue_work(grpc_exec_ctx *exec_ctx,
/* n == NULL might mean there's work but it's not available to be popped
* yet - try to ensure another workqueue wakes up to check shortly if so
*/
+ GRPC_POLLING_TRACE(
+ "maybe_do_workqueue_work: pi: %p: more to do, but not yet", pi);
workqueue_maybe_wakeup(pi);
}
+ } else {
+ GRPC_POLLING_TRACE("maybe_do_workqueue_work: pi: %p: read already locked",
+ pi);
}
return false;
}
@@ -1422,7 +1422,10 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx,
/* If we get some workqueue work to do, it might end up completing an item on
the completion queue, so there's no need to poll... so we skip that and
redo the complete loop to verify */
+ GRPC_POLLING_TRACE("pollset_work: pollset: %p, worker %p, pi %p", pollset,
+ worker, pi);
if (!maybe_do_workqueue_work(exec_ctx, pi)) {
+ GRPC_POLLING_TRACE("pollset_work: begins");
gpr_atm_no_barrier_fetch_add(&pi->poller_count, 1);
g_current_thread_polling_island = pi;
@@ -1453,11 +1456,7 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx,
for (int i = 0; i < ep_rv; ++i) {
void *data_ptr = ep_ev[i].data.ptr;
- if (data_ptr == &global_wakeup_fd) {
- grpc_timer_consume_kick();
- append_error(error, grpc_wakeup_fd_consume_wakeup(&global_wakeup_fd),
- err_desc);
- } else if (data_ptr == &pi->workqueue_wakeup_fd) {
+ if (data_ptr == &pi->workqueue_wakeup_fd) {
append_error(error,
grpc_wakeup_fd_consume_wakeup(&pi->workqueue_wakeup_fd),
err_desc);
@@ -1487,6 +1486,7 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx,
g_current_thread_polling_island = NULL;
gpr_atm_no_barrier_fetch_add(&pi->poller_count, -1);
+ GRPC_POLLING_TRACE("pollset_work: ends");
}
GPR_ASSERT(pi != NULL);
@@ -1897,8 +1897,6 @@ static const grpc_event_engine_vtable vtable = {
.pollset_set_add_fd = pollset_set_add_fd,
.pollset_set_del_fd = pollset_set_del_fd,
- .kick_poller = kick_poller,
-
.workqueue_ref = workqueue_ref,
.workqueue_unref = workqueue_unref,
.workqueue_scheduler = workqueue_scheduler,
@@ -1921,7 +1919,8 @@ static bool is_epoll_available() {
return true;
}
-const grpc_event_engine_vtable *grpc_init_epoll_linux(void) {
+const grpc_event_engine_vtable *grpc_init_epollsig_linux(
+ bool explicit_request) {
/* If use of signals is disabled, we cannot use epoll engine*/
if (is_grpc_wakeup_signal_initialized && grpc_wakeup_signal < 0) {
return NULL;
@@ -1936,7 +1935,13 @@ const grpc_event_engine_vtable *grpc_init_epoll_linux(void) {
}
if (!is_grpc_wakeup_signal_initialized) {
- grpc_use_signal(SIGRTMIN + 6);
+ /* TODO(ctiller): when other epoll engines are ready, remove the true || to
+ * force this to be explitly chosen if needed */
+ if (true || explicit_request) {
+ grpc_use_signal(SIGRTMIN + 6);
+ } else {
+ return NULL;
+ }
}
fd_global_init();
@@ -1958,7 +1963,10 @@ const grpc_event_engine_vtable *grpc_init_epoll_linux(void) {
#include "src/core/lib/iomgr/ev_posix.h"
/* If GRPC_LINUX_EPOLL is not defined, it means epoll is not available. Return
* NULL */
-const grpc_event_engine_vtable *grpc_init_epoll_linux(void) { return NULL; }
+const grpc_event_engine_vtable *grpc_init_epollsig_linux(
+ bool explicit_request) {
+ return NULL;
+}
#endif /* defined(GRPC_POSIX_SOCKET) */
void grpc_use_signal(int signum) {}
diff --git a/src/core/lib/iomgr/ev_epoll_linux.h b/src/core/lib/iomgr/ev_epollsig_linux.h
index 8fc3ff59a3..9e4034f2a7 100644
--- a/src/core/lib/iomgr/ev_epoll_linux.h
+++ b/src/core/lib/iomgr/ev_epollsig_linux.h
@@ -31,13 +31,13 @@
*
*/
-#ifndef GRPC_CORE_LIB_IOMGR_EV_EPOLL_LINUX_H
-#define GRPC_CORE_LIB_IOMGR_EV_EPOLL_LINUX_H
+#ifndef GRPC_CORE_LIB_IOMGR_EV_EPOLLSIG_LINUX_H
+#define GRPC_CORE_LIB_IOMGR_EV_EPOLLSIG_LINUX_H
#include "src/core/lib/iomgr/ev_posix.h"
#include "src/core/lib/iomgr/port.h"
-const grpc_event_engine_vtable *grpc_init_epoll_linux(void);
+const grpc_event_engine_vtable *grpc_init_epollsig_linux(bool explicit_request);
#ifdef GRPC_LINUX_EPOLL
void *grpc_fd_get_polling_island(grpc_fd *fd);
@@ -45,4 +45,4 @@ void *grpc_pollset_get_polling_island(grpc_pollset *ps);
bool grpc_are_polling_islands_equal(void *p, void *q);
#endif /* defined(GRPC_LINUX_EPOLL) */
-#endif /* GRPC_CORE_LIB_IOMGR_EV_EPOLL_LINUX_H */
+#endif /* GRPC_CORE_LIB_IOMGR_EV_EPOLLSIG_LINUX_H */
diff --git a/src/core/lib/iomgr/ev_poll_posix.c b/src/core/lib/iomgr/ev_poll_posix.c
index 9834cdd197..3a7648ac32 100644
--- a/src/core/lib/iomgr/ev_poll_posix.c
+++ b/src/core/lib/iomgr/ev_poll_posix.c
@@ -58,6 +58,8 @@
#include "src/core/lib/profiling/timers.h"
#include "src/core/lib/support/block_annotate.h"
+#define GRPC_POLLSET_KICK_BROADCAST ((grpc_pollset_worker *)1)
+
/*******************************************************************************
* FD declarations
*/
@@ -122,8 +124,6 @@ struct grpc_fd {
grpc_pollset *read_notifier_pollset;
};
-static grpc_wakeup_fd global_wakeup_fd;
-
/* Begin polling on an fd.
Registers that the given pollset is interested in this fd - so that if read
or writability interest changes, the pollset can be kicked to pick up that
@@ -784,19 +784,14 @@ static grpc_error *pollset_kick(grpc_pollset *p,
static grpc_error *pollset_global_init(void) {
gpr_tls_init(&g_current_thread_poller);
gpr_tls_init(&g_current_thread_worker);
- return grpc_wakeup_fd_init(&global_wakeup_fd);
+ return GRPC_ERROR_NONE;
}
static void pollset_global_shutdown(void) {
- grpc_wakeup_fd_destroy(&global_wakeup_fd);
gpr_tls_destroy(&g_current_thread_poller);
gpr_tls_destroy(&g_current_thread_worker);
}
-static grpc_error *kick_poller(void) {
- return grpc_wakeup_fd_wakeup(&global_wakeup_fd);
-}
-
/* main interface */
static void pollset_init(grpc_pollset *pollset, gpr_mu **mu) {
@@ -815,7 +810,7 @@ static void pollset_init(grpc_pollset *pollset, gpr_mu **mu) {
pollset->pollset_set_count = 0;
}
-static void pollset_destroy(grpc_pollset *pollset) {
+static void pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
GPR_ASSERT(!pollset_has_workers(pollset));
GPR_ASSERT(pollset->idle_jobs.head == pollset->idle_jobs.tail);
while (pollset->local_wakeup_cache) {
@@ -952,13 +947,10 @@ static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
}
fd_count = 0;
- pfd_count = 2;
- pfds[0].fd = GRPC_WAKEUP_FD_GET_READ_FD(&global_wakeup_fd);
+ pfd_count = 1;
+ pfds[0].fd = GRPC_WAKEUP_FD_GET_READ_FD(&worker.wakeup_fd->fd);
pfds[0].events = POLLIN;
pfds[0].revents = 0;
- pfds[1].fd = GRPC_WAKEUP_FD_GET_READ_FD(&worker.wakeup_fd->fd);
- pfds[1].events = POLLIN;
- pfds[1].revents = 0;
for (i = 0; i < pollset->fd_count; i++) {
if (fd_is_orphaned(pollset->fds[i])) {
GRPC_FD_UNREF(pollset->fds[i], "multipoller");
@@ -974,7 +966,7 @@ static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
pollset->fd_count = fd_count;
gpr_mu_unlock(&pollset->mu);
- for (i = 2; i < pfd_count; i++) {
+ for (i = 1; i < pfd_count; i++) {
grpc_fd *fd = watchers[i].fd;
pfds[i].events = (short)fd_begin_poll(fd, pollset, &worker, POLLIN,
POLLOUT, &watchers[i]);
@@ -992,7 +984,7 @@ static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
work_combine_error(&error, GRPC_OS_ERROR(errno, "poll"));
}
- for (i = 2; i < pfd_count; i++) {
+ for (i = 1; i < pfd_count; i++) {
if (watchers[i].fd == NULL) {
fd_end_poll(exec_ctx, &watchers[i], 0, 0, NULL);
} else {
@@ -1002,20 +994,15 @@ static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
}
}
} else if (r == 0) {
- for (i = 2; i < pfd_count; i++) {
+ for (i = 1; i < pfd_count; i++) {
fd_end_poll(exec_ctx, &watchers[i], 0, 0, NULL);
}
} else {
if (pfds[0].revents & POLLIN_CHECK) {
- grpc_timer_consume_kick();
- work_combine_error(&error,
- grpc_wakeup_fd_consume_wakeup(&global_wakeup_fd));
- }
- if (pfds[1].revents & POLLIN_CHECK) {
work_combine_error(
&error, grpc_wakeup_fd_consume_wakeup(&worker.wakeup_fd->fd));
}
- for (i = 2; i < pfd_count; i++) {
+ for (i = 1; i < pfd_count; i++) {
if (watchers[i].fd == NULL) {
fd_end_poll(exec_ctx, &watchers[i], 0, 0, NULL);
} else {
@@ -1560,8 +1547,6 @@ static const grpc_event_engine_vtable vtable = {
.pollset_set_add_fd = pollset_set_add_fd,
.pollset_set_del_fd = pollset_set_del_fd,
- .kick_poller = kick_poller,
-
.workqueue_ref = workqueue_ref,
.workqueue_unref = workqueue_unref,
.workqueue_scheduler = workqueue_scheduler,
@@ -1569,7 +1554,7 @@ static const grpc_event_engine_vtable vtable = {
.shutdown_engine = shutdown_engine,
};
-const grpc_event_engine_vtable *grpc_init_poll_posix(void) {
+const grpc_event_engine_vtable *grpc_init_poll_posix(bool explicit_request) {
if (!grpc_has_wakeup_fd()) {
return NULL;
}
@@ -1579,7 +1564,7 @@ const grpc_event_engine_vtable *grpc_init_poll_posix(void) {
return &vtable;
}
-const grpc_event_engine_vtable *grpc_init_poll_cv_posix(void) {
+const grpc_event_engine_vtable *grpc_init_poll_cv_posix(bool explicit_request) {
global_cv_fd_table_init();
grpc_enable_cv_wakeup_fds(1);
if (!GRPC_LOG_IF_ERROR("pollset_global_init", pollset_global_init())) {
diff --git a/src/core/lib/iomgr/ev_poll_posix.h b/src/core/lib/iomgr/ev_poll_posix.h
index 202ffca14c..2890e93ead 100644
--- a/src/core/lib/iomgr/ev_poll_posix.h
+++ b/src/core/lib/iomgr/ev_poll_posix.h
@@ -36,7 +36,7 @@
#include "src/core/lib/iomgr/ev_posix.h"
-const grpc_event_engine_vtable *grpc_init_poll_posix(void);
-const grpc_event_engine_vtable *grpc_init_poll_cv_posix(void);
+const grpc_event_engine_vtable *grpc_init_poll_posix(bool explicit_request);
+const grpc_event_engine_vtable *grpc_init_poll_cv_posix(bool explicit_request);
#endif /* GRPC_CORE_LIB_IOMGR_EV_POLL_POSIX_H */
diff --git a/src/core/lib/iomgr/ev_posix.c b/src/core/lib/iomgr/ev_posix.c
index 13409a4de8..c4d2f23e29 100644
--- a/src/core/lib/iomgr/ev_posix.c
+++ b/src/core/lib/iomgr/ev_posix.c
@@ -44,10 +44,18 @@
#include <grpc/support/string_util.h>
#include <grpc/support/useful.h>
-#include "src/core/lib/iomgr/ev_epoll_linux.h"
+#include "src/core/lib/debug/trace.h"
+#include "src/core/lib/iomgr/ev_epoll1_linux.h"
+#include "src/core/lib/iomgr/ev_epoll_limited_pollers_linux.h"
+#include "src/core/lib/iomgr/ev_epoll_thread_pool_linux.h"
+#include "src/core/lib/iomgr/ev_epollex_linux.h"
+#include "src/core/lib/iomgr/ev_epollsig_linux.h"
#include "src/core/lib/iomgr/ev_poll_posix.h"
#include "src/core/lib/support/env.h"
+grpc_tracer_flag grpc_polling_trace =
+ GRPC_TRACER_INITIALIZER(false); /* Disabled by default */
+
/** Default poll() function - a pointer so that it can be overridden by some
* tests */
grpc_poll_function_type grpc_poll_function = poll;
@@ -57,7 +65,8 @@ grpc_wakeup_fd grpc_global_wakeup_fd;
static const grpc_event_engine_vtable *g_event_engine;
static const char *g_poll_strategy_name = NULL;
-typedef const grpc_event_engine_vtable *(*event_engine_factory_fn)(void);
+typedef const grpc_event_engine_vtable *(*event_engine_factory_fn)(
+ bool explicit_request);
typedef struct {
const char *name;
@@ -65,7 +74,11 @@ typedef struct {
} event_engine_factory;
static const event_engine_factory g_factories[] = {
- {"epoll", grpc_init_epoll_linux},
+ {"epollex", grpc_init_epollex_linux},
+ {"epollsig", grpc_init_epollsig_linux},
+ {"epoll1", grpc_init_epoll1_linux},
+ {"epoll-threadpool", grpc_init_epoll_thread_pool_linux},
+ {"epoll-limited", grpc_init_epoll_limited_pollers_linux},
{"poll", grpc_init_poll_posix},
{"poll-cv", grpc_init_poll_cv_posix},
};
@@ -102,7 +115,8 @@ static bool is(const char *want, const char *have) {
static void try_engine(const char *engine) {
for (size_t i = 0; i < GPR_ARRAY_SIZE(g_factories); i++) {
if (is(engine, g_factories[i].name)) {
- if ((g_event_engine = g_factories[i].factory())) {
+ if ((g_event_engine = g_factories[i].factory(
+ 0 == strcmp(engine, g_factories[i].name)))) {
g_poll_strategy_name = g_factories[i].name;
gpr_log(GPR_DEBUG, "Using polling engine: %s", g_factories[i].name);
return;
@@ -121,6 +135,8 @@ void grpc_set_event_engine_test_only(
const char *grpc_get_poll_strategy_name() { return g_poll_strategy_name; }
void grpc_event_engine_init(void) {
+ grpc_register_tracer("polling", &grpc_polling_trace);
+
char *s = gpr_getenv("GRPC_POLL_STRATEGY");
if (s == NULL) {
s = gpr_strdup("all");
@@ -197,8 +213,8 @@ void grpc_pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
g_event_engine->pollset_shutdown(exec_ctx, pollset, closure);
}
-void grpc_pollset_destroy(grpc_pollset *pollset) {
- g_event_engine->pollset_destroy(pollset);
+void grpc_pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
+ g_event_engine->pollset_destroy(exec_ctx, pollset);
}
grpc_error *grpc_pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
@@ -260,8 +276,6 @@ void grpc_pollset_set_del_fd(grpc_exec_ctx *exec_ctx,
g_event_engine->pollset_set_del_fd(exec_ctx, pollset_set, fd);
}
-grpc_error *grpc_kick_poller(void) { return g_event_engine->kick_poller(); }
-
#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
grpc_workqueue *grpc_workqueue_ref(grpc_workqueue *workqueue, const char *file,
int line, const char *reason) {
diff --git a/src/core/lib/iomgr/ev_posix.h b/src/core/lib/iomgr/ev_posix.h
index becc4d359e..80619aab5f 100644
--- a/src/core/lib/iomgr/ev_posix.h
+++ b/src/core/lib/iomgr/ev_posix.h
@@ -36,12 +36,15 @@
#include <poll.h>
+#include "src/core/lib/debug/trace.h"
#include "src/core/lib/iomgr/exec_ctx.h"
#include "src/core/lib/iomgr/pollset.h"
#include "src/core/lib/iomgr/pollset_set.h"
#include "src/core/lib/iomgr/wakeup_fd_posix.h"
#include "src/core/lib/iomgr/workqueue.h"
+extern grpc_tracer_flag grpc_polling_trace; /* Disabled by default */
+
typedef struct grpc_fd grpc_fd;
typedef struct grpc_event_engine_vtable {
@@ -64,7 +67,7 @@ typedef struct grpc_event_engine_vtable {
void (*pollset_init)(grpc_pollset *pollset, gpr_mu **mu);
void (*pollset_shutdown)(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
grpc_closure *closure);
- void (*pollset_destroy)(grpc_pollset *pollset);
+ void (*pollset_destroy)(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset);
grpc_error *(*pollset_work)(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
grpc_pollset_worker **worker, gpr_timespec now,
gpr_timespec deadline);
@@ -93,8 +96,6 @@ typedef struct grpc_event_engine_vtable {
void (*pollset_set_del_fd)(grpc_exec_ctx *exec_ctx,
grpc_pollset_set *pollset_set, grpc_fd *fd);
- grpc_error *(*kick_poller)(void);
-
void (*shutdown_engine)(void);
#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG
diff --git a/src/core/lib/iomgr/ev_windows.c b/src/core/lib/iomgr/ev_windows.c
new file mode 100644
index 0000000000..7bf7327823
--- /dev/null
+++ b/src/core/lib/iomgr/ev_windows.c
@@ -0,0 +1,43 @@
+/*
+ *
+ * Copyright 2015, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/lib/iomgr/port.h"
+
+#ifdef GRPC_WINSOCK_SOCKET
+
+#include "src/core/lib/debug/trace.h"
+
+grpc_tracer_flag grpc_polling_trace =
+ GRPC_TRACER_INITIALIZER(false); /* Disabled by default */
+
+#endif // GRPC_WINSOCK_SOCKET
diff --git a/src/core/lib/iomgr/exec_ctx.c b/src/core/lib/iomgr/exec_ctx.c
index 2532a708e7..318bb2b713 100644
--- a/src/core/lib/iomgr/exec_ctx.c
+++ b/src/core/lib/iomgr/exec_ctx.c
@@ -62,6 +62,11 @@ bool grpc_always_ready_to_finish(grpc_exec_ctx *exec_ctx, void *arg_ignored) {
return true;
}
+bool grpc_exec_ctx_has_work(grpc_exec_ctx *exec_ctx) {
+ return exec_ctx->active_combiner != NULL ||
+ !grpc_closure_list_empty(exec_ctx->closure_list);
+}
+
bool grpc_exec_ctx_flush(grpc_exec_ctx *exec_ctx) {
bool did_something = 0;
GPR_TIMER_BEGIN("grpc_exec_ctx_flush", 0);
diff --git a/src/core/lib/iomgr/exec_ctx.h b/src/core/lib/iomgr/exec_ctx.h
index f99a0fee5f..759a3ae2d5 100644
--- a/src/core/lib/iomgr/exec_ctx.h
+++ b/src/core/lib/iomgr/exec_ctx.h
@@ -93,6 +93,8 @@ struct grpc_exec_ctx {
extern grpc_closure_scheduler *grpc_schedule_on_exec_ctx;
+bool grpc_exec_ctx_has_work(grpc_exec_ctx *exec_ctx);
+
/** Flush any work that has been enqueued onto this grpc_exec_ctx.
* Caller must guarantee that no interfering locks are held.
* Returns true if work was performed, false otherwise. */
diff --git a/src/core/lib/iomgr/iomgr.c b/src/core/lib/iomgr/iomgr.c
index 001e528409..1fd41c2f88 100644
--- a/src/core/lib/iomgr/iomgr.c
+++ b/src/core/lib/iomgr/iomgr.c
@@ -47,6 +47,7 @@
#include "src/core/lib/iomgr/iomgr_internal.h"
#include "src/core/lib/iomgr/network_status_tracker.h"
#include "src/core/lib/iomgr/timer.h"
+#include "src/core/lib/iomgr/timer_manager.h"
#include "src/core/lib/support/env.h"
#include "src/core/lib/support/string.h"
@@ -67,6 +68,8 @@ void grpc_iomgr_init(void) {
grpc_iomgr_platform_init();
}
+void grpc_iomgr_start(void) { grpc_timer_manager_init(); }
+
static size_t count_objects(void) {
grpc_iomgr_object *obj;
size_t n = 0;
@@ -88,6 +91,7 @@ void grpc_iomgr_shutdown(grpc_exec_ctx *exec_ctx) {
gpr_now(GPR_CLOCK_REALTIME), gpr_time_from_seconds(10, GPR_TIMESPAN));
gpr_timespec last_warning_time = gpr_now(GPR_CLOCK_REALTIME);
+ grpc_timer_manager_shutdown();
grpc_iomgr_platform_flush();
gpr_mu_lock(&g_mu);
diff --git a/src/core/lib/iomgr/iomgr.h b/src/core/lib/iomgr/iomgr.h
index 245a1e08aa..6e2e023615 100644
--- a/src/core/lib/iomgr/iomgr.h
+++ b/src/core/lib/iomgr/iomgr.h
@@ -40,6 +40,9 @@
/** Initializes the iomgr. */
void grpc_iomgr_init(void);
+/** Starts any background threads for iomgr. */
+void grpc_iomgr_start(void);
+
/** Signals the intention to shutdown the iomgr. Expects to be able to flush
* exec_ctx. */
void grpc_iomgr_shutdown(grpc_exec_ctx *exec_ctx);
diff --git a/src/core/lib/iomgr/is_epollexclusive_available.c b/src/core/lib/iomgr/is_epollexclusive_available.c
new file mode 100644
index 0000000000..34fcf313e5
--- /dev/null
+++ b/src/core/lib/iomgr/is_epollexclusive_available.c
@@ -0,0 +1,116 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/lib/iomgr/port.h"
+
+#include "src/core/lib/iomgr/is_epollexclusive_available.h"
+
+#ifdef GRPC_LINUX_EPOLL
+
+#include <grpc/support/log.h>
+
+#include <errno.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+#include "src/core/lib/iomgr/sys_epoll_wrapper.h"
+
+/* This polling engine is only relevant on linux kernels supporting epoll() */
+bool grpc_is_epollexclusive_available(void) {
+ static bool logged_why_not = false;
+
+ int fd = epoll_create1(EPOLL_CLOEXEC);
+ if (fd < 0) {
+ if (!logged_why_not) {
+ gpr_log(GPR_ERROR,
+ "epoll_create1 failed with error: %d. Not using epollex polling "
+ "engine.",
+ fd);
+ logged_why_not = true;
+ }
+ return false;
+ }
+ int evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (evfd < 0) {
+ if (!logged_why_not) {
+ gpr_log(GPR_ERROR,
+ "eventfd failed with error: %d. Not using epollex polling "
+ "engine.",
+ fd);
+ logged_why_not = true;
+ }
+ close(fd);
+ return false;
+ }
+ struct epoll_event ev = {
+ /* choose events that should cause an error on
+ EPOLLEXCLUSIVE enabled kernels - specifically the combination of
+ EPOLLONESHOT and EPOLLEXCLUSIVE */
+ .events = (uint32_t)(EPOLLET | EPOLLIN | EPOLLEXCLUSIVE | EPOLLONESHOT),
+ .data.ptr = NULL};
+ if (epoll_ctl(fd, EPOLL_CTL_ADD, evfd, &ev) != 0) {
+ if (errno != EINVAL) {
+ if (!logged_why_not) {
+ gpr_log(
+ GPR_ERROR,
+ "epoll_ctl with EPOLLEXCLUSIVE | EPOLLONESHOT failed with error: "
+ "%d. Not using epollex polling engine.",
+ errno);
+ logged_why_not = true;
+ }
+ close(fd);
+ close(evfd);
+ return false;
+ }
+ } else {
+ if (!logged_why_not) {
+ gpr_log(GPR_ERROR,
+ "epoll_ctl with EPOLLEXCLUSIVE | EPOLLONESHOT succeeded. This is "
+ "evidence of no EPOLLEXCLUSIVE support. Not using "
+ "epollex polling engine.");
+ logged_why_not = true;
+ }
+ close(fd);
+ close(evfd);
+ return false;
+ }
+ close(evfd);
+ close(fd);
+ return true;
+}
+
+#else
+
+bool grpc_is_epollexclusive_available(void) { return false; }
+
+#endif
diff --git a/src/core/lib/iomgr/is_epollexclusive_available.h b/src/core/lib/iomgr/is_epollexclusive_available.h
new file mode 100644
index 0000000000..b65b819e74
--- /dev/null
+++ b/src/core/lib/iomgr/is_epollexclusive_available.h
@@ -0,0 +1,41 @@
+/*
+ *
+ * Copyright 2015, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_IOMGR_IS_EPOLLEXCLUSIVE_AVAILABLE_H
+#define GRPC_CORE_LIB_IOMGR_IS_EPOLLEXCLUSIVE_AVAILABLE_H
+
+#include <stdbool.h>
+
+bool grpc_is_epollexclusive_available(void);
+
+#endif /* GRPC_CORE_LIB_IOMGR_IS_EPOLLEXCLUSIVE_AVAILABLE_H */
diff --git a/src/core/lib/iomgr/lockfree_event.c b/src/core/lib/iomgr/lockfree_event.c
index 17e3bbf727..898ec1cb1b 100644
--- a/src/core/lib/iomgr/lockfree_event.c
+++ b/src/core/lib/iomgr/lockfree_event.c
@@ -35,6 +35,10 @@
#include <grpc/support/log.h>
+#include "src/core/lib/debug/trace.h"
+
+extern grpc_tracer_flag grpc_polling_trace;
+
/* 'state' holds the to call when the fd is readable or writable respectively.
It can contain one of the following values:
CLOSURE_READY : The fd has an I/O event of interest but there is no
@@ -93,6 +97,10 @@ void grpc_lfev_notify_on(grpc_exec_ctx *exec_ctx, gpr_atm *state,
grpc_closure *closure) {
while (true) {
gpr_atm curr = gpr_atm_no_barrier_load(state);
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "lfev_notify_on: %p curr=%p closure=%p", state,
+ (void *)curr, closure);
+ }
switch (curr) {
case CLOSURE_NOT_READY: {
/* CLOSURE_NOT_READY -> <closure>.
@@ -155,6 +163,10 @@ bool grpc_lfev_set_shutdown(grpc_exec_ctx *exec_ctx, gpr_atm *state,
while (true) {
gpr_atm curr = gpr_atm_no_barrier_load(state);
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "lfev_set_shutdown: %p curr=%p err=%s", state,
+ (void *)curr, grpc_error_string(shutdown_err));
+ }
switch (curr) {
case CLOSURE_READY:
case CLOSURE_NOT_READY:
@@ -200,6 +212,10 @@ void grpc_lfev_set_ready(grpc_exec_ctx *exec_ctx, gpr_atm *state) {
while (true) {
gpr_atm curr = gpr_atm_no_barrier_load(state);
+ if (GRPC_TRACER_ON(grpc_polling_trace)) {
+ gpr_log(GPR_DEBUG, "lfev_set_ready: %p curr=%p", state, (void *)curr);
+ }
+
switch (curr) {
case CLOSURE_READY: {
/* Already ready. We are done here */
diff --git a/src/core/lib/iomgr/pollset.h b/src/core/lib/iomgr/pollset.h
index 9bf3cdac89..69e20098d7 100644
--- a/src/core/lib/iomgr/pollset.h
+++ b/src/core/lib/iomgr/pollset.h
@@ -40,8 +40,6 @@
#include "src/core/lib/iomgr/exec_ctx.h"
-#define GRPC_POLLSET_KICK_BROADCAST ((grpc_pollset_worker *)1)
-
/* A grpc_pollset is a set of file descriptors that a higher level item is
interested in. For example:
- a server will typically keep a pollset containing all connected channels,
@@ -59,7 +57,7 @@ void grpc_pollset_init(grpc_pollset *pollset, gpr_mu **mu);
* pollset's mutex must be held */
void grpc_pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
grpc_closure *closure);
-void grpc_pollset_destroy(grpc_pollset *pollset);
+void grpc_pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset);
/* Do some work on a pollset.
May involve invoking asynchronous callbacks, or actually polling file
@@ -88,8 +86,7 @@ grpc_error *grpc_pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
gpr_timespec deadline) GRPC_MUST_USE_RESULT;
/* Break one polling thread out of polling work for this pollset.
- If specific_worker is GRPC_POLLSET_KICK_BROADCAST, kick ALL the workers.
- Otherwise, if specific_worker is non-NULL, then kick that worker. */
+ If specific_worker is non-NULL, then kick that worker. */
grpc_error *grpc_pollset_kick(grpc_pollset *pollset,
grpc_pollset_worker *specific_worker)
GRPC_MUST_USE_RESULT;
diff --git a/src/core/lib/iomgr/pollset_uv.c b/src/core/lib/iomgr/pollset_uv.c
index a2f81bcd78..5923da98e2 100644
--- a/src/core/lib/iomgr/pollset_uv.c
+++ b/src/core/lib/iomgr/pollset_uv.c
@@ -106,7 +106,7 @@ void grpc_pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
grpc_closure_sched(exec_ctx, closure, GRPC_ERROR_NONE);
}
-void grpc_pollset_destroy(grpc_pollset *pollset) {
+void grpc_pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
uv_close((uv_handle_t *)&pollset->timer, timer_close_cb);
// timer.data is a boolean indicating that the timer has finished closing
pollset->timer.data = (void *)0;
diff --git a/src/core/lib/iomgr/pollset_windows.c b/src/core/lib/iomgr/pollset_windows.c
index 04c6b71747..b5f454cfa9 100644
--- a/src/core/lib/iomgr/pollset_windows.c
+++ b/src/core/lib/iomgr/pollset_windows.c
@@ -43,6 +43,8 @@
#include "src/core/lib/iomgr/pollset.h"
#include "src/core/lib/iomgr/pollset_windows.h"
+#define GRPC_POLLSET_KICK_BROADCAST ((grpc_pollset_worker *)1)
+
gpr_mu grpc_polling_mu;
static grpc_pollset_worker *g_active_poller;
static grpc_pollset_worker g_global_root_worker;
@@ -114,7 +116,7 @@ void grpc_pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
}
}
-void grpc_pollset_destroy(grpc_pollset *pollset) {}
+void grpc_pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {}
grpc_error *grpc_pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
grpc_pollset_worker **worker_hdl,
@@ -227,6 +229,4 @@ grpc_error *grpc_pollset_kick(grpc_pollset *p,
return GRPC_ERROR_NONE;
}
-void grpc_kick_poller(void) { grpc_iocp_kick(); }
-
#endif /* GRPC_WINSOCK_SOCKET */
diff --git a/src/core/lib/iomgr/port.h b/src/core/lib/iomgr/port.h
index 269dc35003..2a553f4114 100644
--- a/src/core/lib/iomgr/port.h
+++ b/src/core/lib/iomgr/port.h
@@ -88,6 +88,7 @@
#ifndef __GLIBC__
#define GRPC_LINUX_EPOLL 1
#define GRPC_LINUX_EVENTFD 1
+#define GRPC_MSG_IOVLEN_TYPE int
#endif
#ifndef GRPC_LINUX_EVENTFD
#define GRPC_POSIX_NO_SPECIAL_WAKEUP_FD 1
diff --git a/src/core/lib/iomgr/resource_quota.c b/src/core/lib/iomgr/resource_quota.c
index c3ee878651..6b2b85cce0 100644
--- a/src/core/lib/iomgr/resource_quota.c
+++ b/src/core/lib/iomgr/resource_quota.c
@@ -44,7 +44,7 @@
#include "src/core/lib/iomgr/combiner.h"
-int grpc_resource_quota_trace = 0;
+grpc_tracer_flag grpc_resource_quota_trace = GRPC_TRACER_INITIALIZER(false);
#define MEMORY_USAGE_ESTIMATION_MAX 65536
@@ -307,13 +307,14 @@ static bool rq_alloc(grpc_exec_ctx *exec_ctx,
resource_user->free_pool = 0;
resource_quota->free_pool -= amt;
rq_update_estimate(resource_quota);
- if (grpc_resource_quota_trace) {
+ if (GRPC_TRACER_ON(grpc_resource_quota_trace)) {
gpr_log(GPR_DEBUG, "RQ %s %s: grant alloc %" PRId64
" bytes; rq_free_pool -> %" PRId64,
resource_quota->name, resource_user->name, amt,
resource_quota->free_pool);
}
- } else if (grpc_resource_quota_trace && resource_user->free_pool >= 0) {
+ } else if (GRPC_TRACER_ON(grpc_resource_quota_trace) &&
+ resource_user->free_pool >= 0) {
gpr_log(GPR_DEBUG, "RQ %s %s: discard already satisfied alloc request",
resource_quota->name, resource_user->name);
}
@@ -342,7 +343,7 @@ static bool rq_reclaim_from_per_user_free_pool(
resource_user->free_pool = 0;
resource_quota->free_pool += amt;
rq_update_estimate(resource_quota);
- if (grpc_resource_quota_trace) {
+ if (GRPC_TRACER_ON(grpc_resource_quota_trace)) {
gpr_log(GPR_DEBUG, "RQ %s %s: reclaim_from_per_user_free_pool %" PRId64
" bytes; rq_free_pool -> %" PRId64,
resource_quota->name, resource_user->name, amt,
@@ -365,7 +366,7 @@ static bool rq_reclaim(grpc_exec_ctx *exec_ctx,
: GRPC_RULIST_RECLAIMER_BENIGN;
grpc_resource_user *resource_user = rulist_pop_head(resource_quota, list);
if (resource_user == NULL) return false;
- if (grpc_resource_quota_trace) {
+ if (GRPC_TRACER_ON(grpc_resource_quota_trace)) {
gpr_log(GPR_DEBUG, "RQ %s %s: initiate %s reclamation",
resource_quota->name, resource_user->name,
destructive ? "destructive" : "benign");
@@ -786,7 +787,7 @@ void grpc_resource_user_alloc(grpc_exec_ctx *exec_ctx,
gpr_mu_lock(&resource_user->mu);
ru_ref_by(resource_user, (gpr_atm)size);
resource_user->free_pool -= (int64_t)size;
- if (grpc_resource_quota_trace) {
+ if (GRPC_TRACER_ON(grpc_resource_quota_trace)) {
gpr_log(GPR_DEBUG, "RQ %s %s: alloc %" PRIdPTR "; free_pool -> %" PRId64,
resource_user->resource_quota->name, resource_user->name, size,
resource_user->free_pool);
@@ -810,7 +811,7 @@ void grpc_resource_user_free(grpc_exec_ctx *exec_ctx,
gpr_mu_lock(&resource_user->mu);
bool was_zero_or_negative = resource_user->free_pool <= 0;
resource_user->free_pool += (int64_t)size;
- if (grpc_resource_quota_trace) {
+ if (GRPC_TRACER_ON(grpc_resource_quota_trace)) {
gpr_log(GPR_DEBUG, "RQ %s %s: free %" PRIdPTR "; free_pool -> %" PRId64,
resource_user->resource_quota->name, resource_user->name, size,
resource_user->free_pool);
@@ -839,7 +840,7 @@ void grpc_resource_user_post_reclaimer(grpc_exec_ctx *exec_ctx,
void grpc_resource_user_finish_reclamation(grpc_exec_ctx *exec_ctx,
grpc_resource_user *resource_user) {
- if (grpc_resource_quota_trace) {
+ if (GRPC_TRACER_ON(grpc_resource_quota_trace)) {
gpr_log(GPR_DEBUG, "RQ %s %s: reclamation complete",
resource_user->resource_quota->name, resource_user->name);
}
diff --git a/src/core/lib/iomgr/resource_quota.h b/src/core/lib/iomgr/resource_quota.h
index 6f99be0d51..51122dad01 100644
--- a/src/core/lib/iomgr/resource_quota.h
+++ b/src/core/lib/iomgr/resource_quota.h
@@ -36,6 +36,7 @@
#include <grpc/grpc.h>
+#include "src/core/lib/debug/trace.h"
#include "src/core/lib/iomgr/exec_ctx.h"
/** \file Tracks resource usage against a pool.
@@ -75,7 +76,7 @@
maintain lists of users (which users arrange to leave before they are
destroyed) */
-extern int grpc_resource_quota_trace;
+extern grpc_tracer_flag grpc_resource_quota_trace;
grpc_resource_quota *grpc_resource_quota_ref_internal(
grpc_resource_quota *resource_quota);
diff --git a/src/core/lib/iomgr/sys_epoll_wrapper.h b/src/core/lib/iomgr/sys_epoll_wrapper.h
new file mode 100644
index 0000000000..2f08423193
--- /dev/null
+++ b/src/core/lib/iomgr/sys_epoll_wrapper.h
@@ -0,0 +1,43 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_IOMGR_SYS_EPOLL_WRAPPER_H
+#define GRPC_CORE_LIB_IOMGR_SYS_EPOLL_WRAPPER_H
+
+#include <sys/epoll.h>
+
+#ifndef EPOLLEXCLUSIVE
+#define EPOLLEXCLUSIVE (1 << 28)
+#endif
+
+#endif /* GRPC_CORE_LIB_IOMGR_SYS_EPOLL_WRAPPER_H */
diff --git a/src/core/lib/iomgr/tcp_client_posix.c b/src/core/lib/iomgr/tcp_client_posix.c
index a2692707d9..5c7da999e0 100644
--- a/src/core/lib/iomgr/tcp_client_posix.c
+++ b/src/core/lib/iomgr/tcp_client_posix.c
@@ -58,7 +58,7 @@
#include "src/core/lib/iomgr/unix_sockets_posix.h"
#include "src/core/lib/support/string.h"
-extern int grpc_tcp_trace;
+extern grpc_tracer_flag grpc_tcp_trace;
typedef struct {
gpr_mu mu;
@@ -114,7 +114,7 @@ done:
static void tc_on_alarm(grpc_exec_ctx *exec_ctx, void *acp, grpc_error *error) {
int done;
async_connect *ac = acp;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "CLIENT_CONNECT: %s: on_alarm: error=%s", ac->addr_str,
str);
@@ -152,7 +152,7 @@ static void on_writable(grpc_exec_ctx *exec_ctx, void *acp, grpc_error *error) {
GRPC_ERROR_REF(error);
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "CLIENT_CONNECT: %s: on_writable: error=%s",
ac->addr_str, str);
@@ -330,9 +330,9 @@ static void tcp_client_connect_impl(grpc_exec_ctx *exec_ctx,
grpc_schedule_on_exec_ctx);
ac->channel_args = grpc_channel_args_copy(channel_args);
- if (grpc_tcp_trace) {
- gpr_log(GPR_DEBUG, "CLIENT_CONNECT: %s: asynchronously connecting",
- ac->addr_str);
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
+ gpr_log(GPR_DEBUG, "CLIENT_CONNECT: %s: asynchronously connecting fd %p",
+ ac->addr_str, fdobj);
}
gpr_mu_lock(&ac->mu);
diff --git a/src/core/lib/iomgr/tcp_client_uv.c b/src/core/lib/iomgr/tcp_client_uv.c
index 682c24ed56..f0856a76d4 100644
--- a/src/core/lib/iomgr/tcp_client_uv.c
+++ b/src/core/lib/iomgr/tcp_client_uv.c
@@ -46,7 +46,7 @@
#include "src/core/lib/iomgr/tcp_uv.h"
#include "src/core/lib/iomgr/timer.h"
-extern int grpc_tcp_trace;
+extern grpc_tracer_flag grpc_tcp_trace;
typedef struct grpc_uv_tcp_connect {
uv_connect_t connect_req;
@@ -72,7 +72,7 @@ static void uv_tc_on_alarm(grpc_exec_ctx *exec_ctx, void *acp,
grpc_error *error) {
int done;
grpc_uv_tcp_connect *connect = acp;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "CLIENT_CONNECT: %s: on_alarm: error=%s",
connect->addr_name, str);
@@ -156,7 +156,7 @@ static void tcp_client_connect_impl(grpc_exec_ctx *exec_ctx,
uv_tcp_init(uv_default_loop(), connect->tcp_handle);
connect->connect_req.data = connect;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
gpr_log(GPR_DEBUG, "CLIENT_CONNECT: %s: asynchronously connecting",
connect->addr_name);
}
diff --git a/src/core/lib/iomgr/tcp_posix.c b/src/core/lib/iomgr/tcp_posix.c
index 5f4b38de2b..5d360b0b80 100644
--- a/src/core/lib/iomgr/tcp_posix.c
+++ b/src/core/lib/iomgr/tcp_posix.c
@@ -74,7 +74,7 @@ typedef GRPC_MSG_IOVLEN_TYPE msg_iovlen_type;
typedef size_t msg_iovlen_type;
#endif
-int grpc_tcp_trace = 0;
+grpc_tracer_flag grpc_tcp_trace = GRPC_TRACER_INITIALIZER(false);
typedef struct {
grpc_endpoint base;
@@ -221,7 +221,7 @@ static void call_read_cb(grpc_exec_ctx *exec_ctx, grpc_tcp *tcp,
grpc_error *error) {
grpc_closure *cb = tcp->read_cb;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
size_t i;
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "read: error=%s", str);
@@ -468,14 +468,14 @@ static void tcp_handle_write(grpc_exec_ctx *exec_ctx, void *arg /* grpc_tcp */,
}
if (!tcp_flush(tcp, &error)) {
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
gpr_log(GPR_DEBUG, "write: delayed");
}
grpc_fd_notify_on_write(exec_ctx, tcp->em_fd, &tcp->write_closure);
} else {
cb = tcp->write_cb;
tcp->write_cb = NULL;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "write: %s", str);
}
@@ -490,7 +490,7 @@ static void tcp_write(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
grpc_tcp *tcp = (grpc_tcp *)ep;
grpc_error *error = GRPC_ERROR_NONE;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
size_t i;
for (i = 0; i < buf->count; i++) {
@@ -521,12 +521,12 @@ static void tcp_write(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
if (!tcp_flush(tcp, &error)) {
TCP_REF(tcp, "write");
tcp->write_cb = cb;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
gpr_log(GPR_DEBUG, "write: delayed");
}
grpc_fd_notify_on_write(exec_ctx, tcp->em_fd, &tcp->write_closure);
} else {
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "write: %s", str);
}
diff --git a/src/core/lib/iomgr/tcp_posix.h b/src/core/lib/iomgr/tcp_posix.h
index 1ad5788331..4ad60c116e 100644
--- a/src/core/lib/iomgr/tcp_posix.h
+++ b/src/core/lib/iomgr/tcp_posix.h
@@ -44,10 +44,11 @@
otherwise specified.
*/
+#include "src/core/lib/debug/trace.h"
#include "src/core/lib/iomgr/endpoint.h"
#include "src/core/lib/iomgr/ev_posix.h"
-extern int grpc_tcp_trace;
+extern grpc_tracer_flag grpc_tcp_trace;
/* Create a tcp endpoint given a file desciptor and a read slice size.
Takes ownership of fd. */
diff --git a/src/core/lib/iomgr/tcp_server_posix.c b/src/core/lib/iomgr/tcp_server_posix.c
index e66ffc9b1c..08997b5e2b 100644
--- a/src/core/lib/iomgr/tcp_server_posix.c
+++ b/src/core/lib/iomgr/tcp_server_posix.c
@@ -257,7 +257,7 @@ static void on_read(grpc_exec_ctx *exec_ctx, void *arg, grpc_error *err) {
addr_str = grpc_sockaddr_to_uri(&addr);
gpr_asprintf(&name, "tcp-server-connection:%s", addr_str);
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
gpr_log(GPR_DEBUG, "SERVER_CONNECT: incoming connection: %s", addr_str);
}
diff --git a/src/core/lib/iomgr/tcp_server_uv.c b/src/core/lib/iomgr/tcp_server_uv.c
index e9246948f5..d446e5312a 100644
--- a/src/core/lib/iomgr/tcp_server_uv.c
+++ b/src/core/lib/iomgr/tcp_server_uv.c
@@ -56,6 +56,8 @@ struct grpc_tcp_listener {
int port;
/* linked list */
struct grpc_tcp_listener *next;
+
+ bool closed;
};
struct grpc_tcp_server {
@@ -77,6 +79,8 @@ struct grpc_tcp_server {
/* shutdown callback */
grpc_closure *shutdown_complete;
+ bool shutdown;
+
grpc_resource_quota *resource_quota;
};
@@ -109,6 +113,7 @@ grpc_error *grpc_tcp_server_create(grpc_exec_ctx *exec_ctx,
s->shutdown_starting.head = NULL;
s->shutdown_starting.tail = NULL;
s->shutdown_complete = shutdown_complete;
+ s->shutdown = false;
*server = s;
return GRPC_ERROR_NONE;
}
@@ -125,6 +130,7 @@ void grpc_tcp_server_shutdown_starting_add(grpc_tcp_server *s,
}
static void finish_shutdown(grpc_exec_ctx *exec_ctx, grpc_tcp_server *s) {
+ GPR_ASSERT(s->shutdown);
if (s->shutdown_complete != NULL) {
grpc_closure_sched(exec_ctx, s->shutdown_complete, GRPC_ERROR_NONE);
}
@@ -144,21 +150,31 @@ static void handle_close_callback(uv_handle_t *handle) {
grpc_tcp_listener *sp = (grpc_tcp_listener *)handle->data;
grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
sp->server->open_ports--;
- if (sp->server->open_ports == 0) {
+ if (sp->server->open_ports == 0 && sp->server->shutdown) {
finish_shutdown(&exec_ctx, sp->server);
}
grpc_exec_ctx_finish(&exec_ctx);
}
+static void close_listener(grpc_tcp_listener *sp) {
+ if (!sp->closed) {
+ sp->closed = true;
+ uv_close((uv_handle_t *)sp->handle, handle_close_callback);
+ }
+}
+
static void tcp_server_destroy(grpc_exec_ctx *exec_ctx, grpc_tcp_server *s) {
int immediately_done = 0;
grpc_tcp_listener *sp;
+ GPR_ASSERT(!s->shutdown);
+ s->shutdown = true;
+
if (s->open_ports == 0) {
immediately_done = 1;
}
for (sp = s->head; sp; sp = sp->next) {
- uv_close((uv_handle_t *)sp->handle, handle_close_callback);
+ close_listener(sp);
}
if (immediately_done) {
@@ -196,9 +212,14 @@ static void on_connect(uv_stream_t *server, int status) {
int err;
if (status < 0) {
- gpr_log(GPR_INFO, "Skipping on_accept due to error: %s",
- uv_strerror(status));
- return;
+ switch (status) {
+ case UV_EINTR:
+ case UV_EAGAIN:
+ return;
+ default:
+ close_listener(sp);
+ return;
+ }
}
client = gpr_malloc(sizeof(uv_tcp_t));
@@ -287,6 +308,7 @@ static grpc_error *add_socket_to_server(grpc_tcp_server *s, uv_tcp_t *handle,
sp->handle = handle;
sp->port = port;
sp->port_index = port_index;
+ sp->closed = false;
handle->data = sp;
s->open_ports++;
GPR_ASSERT(sp->handle);
diff --git a/src/core/lib/iomgr/tcp_uv.c b/src/core/lib/iomgr/tcp_uv.c
index 8e8db9f7b4..dc23e4f521 100644
--- a/src/core/lib/iomgr/tcp_uv.c
+++ b/src/core/lib/iomgr/tcp_uv.c
@@ -52,7 +52,7 @@
#include "src/core/lib/slice/slice_string_helpers.h"
#include "src/core/lib/support/string.h"
-int grpc_tcp_trace = 0;
+grpc_tracer_flag grpc_tcp_trace = GRPC_TRACER_INITIALIZER(false);
typedef struct {
grpc_endpoint base;
@@ -88,12 +88,12 @@ static void tcp_free(grpc_exec_ctx *exec_ctx, grpc_tcp *tcp) {
#ifdef GRPC_TCP_REFCOUNT_DEBUG
#define TCP_UNREF(exec_ctx, tcp, reason) \
tcp_unref((exec_ctx), (tcp), (reason), __FILE__, __LINE__)
-#define TCP_REF(tcp, reason) \
- tcp_ref((exec_ctx), (tcp), (reason), __FILE__, __LINE__)
+#define TCP_REF(tcp, reason) tcp_ref((tcp), (reason), __FILE__, __LINE__)
static void tcp_unref(grpc_exec_ctx *exec_ctx, grpc_tcp *tcp,
const char *reason, const char *file, int line) {
- gpr_log(file, line, GPR_LOG_SEVERITY_DEBUG, "TCP unref %p : %s %d -> %d", tcp,
- reason, tcp->refcount.count, tcp->refcount.count - 1);
+ gpr_log(file, line, GPR_LOG_SEVERITY_DEBUG,
+ "TCP unref %p : %s %" PRIiPTR " -> %" PRIiPTR, tcp, reason,
+ tcp->refcount.count, tcp->refcount.count - 1);
if (gpr_unref(&tcp->refcount)) {
tcp_free(exec_ctx, tcp);
}
@@ -101,8 +101,9 @@ static void tcp_unref(grpc_exec_ctx *exec_ctx, grpc_tcp *tcp,
static void tcp_ref(grpc_tcp *tcp, const char *reason, const char *file,
int line) {
- gpr_log(file, line, GPR_LOG_SEVERITY_DEBUG, "TCP ref %p : %s %d -> %d", tcp,
- reason, tcp->refcount.count, tcp->refcount.count + 1);
+ gpr_log(file, line, GPR_LOG_SEVERITY_DEBUG,
+ "TCP ref %p : %s %" PRIiPTR " -> %" PRIiPTR, tcp, reason,
+ tcp->refcount.count, tcp->refcount.count + 1);
gpr_ref(&tcp->refcount);
}
#else
@@ -158,7 +159,7 @@ static void read_callback(uv_stream_t *stream, ssize_t nread,
sub = grpc_slice_sub_no_ref(tcp->read_slice, 0, (size_t)nread);
grpc_slice_buffer_add(tcp->read_slices, sub);
error = GRPC_ERROR_NONE;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
size_t i;
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "read: error=%s", str);
@@ -199,7 +200,7 @@ static void uv_endpoint_read(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
grpc_slice_from_static_string(uv_strerror(status)));
grpc_closure_sched(exec_ctx, cb, error);
}
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "Initiating read on %p: error=%s", tcp, str);
}
@@ -217,7 +218,7 @@ static void write_callback(uv_write_t *req, int status) {
} else {
error = GRPC_ERROR_CREATE_FROM_STATIC_STRING("TCP Write failed");
}
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
const char *str = grpc_error_string(error);
gpr_log(GPR_DEBUG, "write complete on %p: error=%s", tcp, str);
}
@@ -238,7 +239,7 @@ static void uv_endpoint_write(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
grpc_slice *slice;
uv_write_t *write_req;
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
size_t j;
for (j = 0; j < write_slices->count; j++) {
@@ -311,6 +312,7 @@ static void uv_endpoint_shutdown(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
tcp->shutting_down = true;
uv_shutdown_t *req = &tcp->shutdown_req;
uv_shutdown(req, (uv_stream_t *)tcp->handle, shutdown_callback);
+ grpc_resource_user_shutdown(exec_ctx, tcp->resource_user);
}
GRPC_ERROR_UNREF(why);
}
@@ -346,7 +348,7 @@ grpc_endpoint *grpc_tcp_create(uv_tcp_t *handle,
char *peer_string) {
grpc_tcp *tcp = (grpc_tcp *)gpr_malloc(sizeof(grpc_tcp));
- if (grpc_tcp_trace) {
+ if (GRPC_TRACER_ON(grpc_tcp_trace)) {
gpr_log(GPR_DEBUG, "Creating TCP endpoint %p", tcp);
}
diff --git a/src/core/lib/iomgr/tcp_uv.h b/src/core/lib/iomgr/tcp_uv.h
index 970fcafe4a..106bec5eca 100644
--- a/src/core/lib/iomgr/tcp_uv.h
+++ b/src/core/lib/iomgr/tcp_uv.h
@@ -44,11 +44,12 @@
otherwise specified.
*/
+#include "src/core/lib/debug/trace.h"
#include "src/core/lib/iomgr/endpoint.h"
#include <uv.h>
-extern int grpc_tcp_trace;
+extern grpc_tracer_flag grpc_tcp_trace;
#define GRPC_TCP_DEFAULT_READ_SLICE_SIZE 8192
diff --git a/src/core/lib/iomgr/timer_generic.c b/src/core/lib/iomgr/timer_generic.c
index d8e6068431..b28340b71c 100644
--- a/src/core/lib/iomgr/timer_generic.c
+++ b/src/core/lib/iomgr/timer_generic.c
@@ -56,8 +56,8 @@
#define MIN_QUEUE_WINDOW_DURATION 0.01
#define MAX_QUEUE_WINDOW_DURATION 1
-int grpc_timer_trace = 0;
-int grpc_timer_check_trace = 0;
+grpc_tracer_flag grpc_timer_trace = GRPC_TRACER_INITIALIZER(false);
+grpc_tracer_flag grpc_timer_check_trace = GRPC_TRACER_INITIALIZER(false);
typedef struct {
gpr_mu mu;
@@ -232,14 +232,13 @@ void grpc_timer_init(grpc_exec_ctx *exec_ctx, grpc_timer *timer,
GPR_ASSERT(deadline.clock_type == g_clock_type);
GPR_ASSERT(now.clock_type == g_clock_type);
timer->closure = closure;
- timer->deadline = timespec_to_atm_round_up(deadline);
+ gpr_atm deadline_atm = timer->deadline = timespec_to_atm_round_up(deadline);
- if (grpc_timer_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_trace)) {
gpr_log(GPR_DEBUG, "TIMER %p: SET %" PRId64 ".%09d [%" PRIdPTR
"] now %" PRId64 ".%09d [%" PRIdPTR "] call %p[%p]",
- timer, deadline.tv_sec, deadline.tv_nsec, timer->deadline,
- now.tv_sec, now.tv_nsec, timespec_to_atm_round_down(now), closure,
- closure->cb);
+ timer, deadline.tv_sec, deadline.tv_nsec, deadline_atm, now.tv_sec,
+ now.tv_nsec, timespec_to_atm_round_down(now), closure, closure->cb);
}
if (!g_shared_mutables.initialized) {
@@ -262,13 +261,13 @@ void grpc_timer_init(grpc_exec_ctx *exec_ctx, grpc_timer *timer,
grpc_time_averaged_stats_add_sample(&shard->stats,
ts_to_dbl(gpr_time_sub(deadline, now)));
- if (timer->deadline < shard->queue_deadline_cap) {
+ if (deadline_atm < shard->queue_deadline_cap) {
is_first_timer = grpc_timer_heap_add(&shard->heap, timer);
} else {
timer->heap_index = INVALID_HEAP_INDEX;
list_join(&shard->list, timer);
}
- if (grpc_timer_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_trace)) {
gpr_log(GPR_DEBUG, " .. add to shard %d with queue_deadline_cap=%" PRIdPTR
" => is_first_timer=%s",
(int)(shard - g_shards), shard->queue_deadline_cap,
@@ -289,16 +288,16 @@ void grpc_timer_init(grpc_exec_ctx *exec_ctx, grpc_timer *timer,
grpc_timer_check. */
if (is_first_timer) {
gpr_mu_lock(&g_shared_mutables.mu);
- if (grpc_timer_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_trace)) {
gpr_log(GPR_DEBUG, " .. old shard min_deadline=%" PRIdPTR,
shard->min_deadline);
}
- if (timer->deadline < shard->min_deadline) {
+ if (deadline_atm < shard->min_deadline) {
gpr_atm old_min_deadline = g_shard_queue[0]->min_deadline;
- shard->min_deadline = timer->deadline;
+ shard->min_deadline = deadline_atm;
note_deadline_change(shard);
- if (shard->shard_queue_index == 0 && timer->deadline < old_min_deadline) {
- gpr_atm_no_barrier_store(&g_shared_mutables.min_timer, timer->deadline);
+ if (shard->shard_queue_index == 0 && deadline_atm < old_min_deadline) {
+ gpr_atm_no_barrier_store(&g_shared_mutables.min_timer, deadline_atm);
grpc_kick_poller();
}
}
@@ -319,7 +318,7 @@ void grpc_timer_cancel(grpc_exec_ctx *exec_ctx, grpc_timer *timer) {
shard_type *shard = &g_shards[GPR_HASH_POINTER(timer, NUM_SHARDS)];
gpr_mu_lock(&shard->mu);
- if (grpc_timer_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_trace)) {
gpr_log(GPR_DEBUG, "TIMER %p: CANCEL pending=%s", timer,
timer->pending ? "true" : "false");
}
@@ -355,7 +354,7 @@ static int refill_queue(shard_type *shard, gpr_atm now) {
saturating_add(GPR_MAX(now, shard->queue_deadline_cap),
(gpr_atm)(deadline_delta * 1000.0));
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
gpr_log(GPR_DEBUG, " .. shard[%d]->queue_deadline_cap --> %" PRIdPTR,
(int)(shard - g_shards), shard->queue_deadline_cap);
}
@@ -363,7 +362,7 @@ static int refill_queue(shard_type *shard, gpr_atm now) {
next = timer->next;
if (timer->deadline < shard->queue_deadline_cap) {
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
gpr_log(GPR_DEBUG, " .. add timer with deadline %" PRIdPTR " to heap",
timer->deadline);
}
@@ -380,7 +379,7 @@ static int refill_queue(shard_type *shard, gpr_atm now) {
static grpc_timer *pop_one(shard_type *shard, gpr_atm now) {
grpc_timer *timer;
for (;;) {
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
gpr_log(GPR_DEBUG, " .. shard[%d]: heap_empty=%s",
(int)(shard - g_shards),
grpc_timer_heap_is_empty(&shard->heap) ? "true" : "false");
@@ -390,13 +389,13 @@ static grpc_timer *pop_one(shard_type *shard, gpr_atm now) {
if (!refill_queue(shard, now)) return NULL;
}
timer = grpc_timer_heap_top(&shard->heap);
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
gpr_log(GPR_DEBUG,
" .. check top timer deadline=%" PRIdPTR " now=%" PRIdPTR,
timer->deadline, now);
}
if (timer->deadline > now) return NULL;
- if (grpc_timer_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_trace)) {
gpr_log(GPR_DEBUG, "TIMER %p: FIRE %" PRIdPTR "ms late", timer,
now - timer->deadline);
}
@@ -436,7 +435,7 @@ static int run_some_expired_timers(grpc_exec_ctx *exec_ctx, gpr_atm now,
if (gpr_spinlock_trylock(&g_shared_mutables.checker_mu)) {
gpr_mu_lock(&g_shared_mutables.mu);
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
gpr_log(GPR_DEBUG, " .. shard[%d]->min_deadline = %" PRIdPTR,
(int)(g_shard_queue[0] - g_shards),
g_shard_queue[0]->min_deadline);
@@ -452,7 +451,7 @@ static int run_some_expired_timers(grpc_exec_ctx *exec_ctx, gpr_atm now,
n +=
pop_timers(exec_ctx, g_shard_queue[0], now, &new_min_deadline, error);
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
gpr_log(GPR_DEBUG, " .. popped --> %" PRIdPTR
", shard[%d]->min_deadline %" PRIdPTR
" --> %" PRIdPTR ", now=%" PRIdPTR,
@@ -509,7 +508,7 @@ bool grpc_timer_check(grpc_exec_ctx *exec_ctx, gpr_timespec now,
*next =
atm_to_timespec(GPR_MIN(timespec_to_atm_round_up(*next), min_timer));
}
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
gpr_log(GPR_DEBUG,
"TIMER CHECK SKIP: now_atm=%" PRIdPTR " min_timer=%" PRIdPTR,
now_atm, min_timer);
@@ -523,7 +522,7 @@ bool grpc_timer_check(grpc_exec_ctx *exec_ctx, gpr_timespec now,
: GRPC_ERROR_CREATE_FROM_STATIC_STRING("Shutting down timer system");
// tracing
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
char *next_str;
if (next == NULL) {
next_str = gpr_strdup("NULL");
@@ -549,7 +548,7 @@ bool grpc_timer_check(grpc_exec_ctx *exec_ctx, gpr_timespec now,
*next = atm_to_timespec(next_atm);
}
// tracing
- if (grpc_timer_check_trace) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
char *next_str;
if (next == NULL) {
next_str = gpr_strdup("NULL");
diff --git a/src/core/lib/iomgr/timer_manager.c b/src/core/lib/iomgr/timer_manager.c
new file mode 100644
index 0000000000..24085093e7
--- /dev/null
+++ b/src/core/lib/iomgr/timer_manager.c
@@ -0,0 +1,276 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/lib/iomgr/timer_manager.h"
+
+#include <grpc/support/alloc.h>
+#include <grpc/support/log.h>
+#include <grpc/support/thd.h>
+
+#include "src/core/lib/debug/trace.h"
+#include "src/core/lib/iomgr/timer.h"
+
+typedef struct completed_thread {
+ gpr_thd_id t;
+ struct completed_thread *next;
+} completed_thread;
+
+extern grpc_tracer_flag grpc_timer_check_trace;
+
+// global mutex
+static gpr_mu g_mu;
+// are we multi-threaded
+static bool g_threaded;
+// cv to wait until a thread is needed
+static gpr_cv g_cv_wait;
+// cv for notification when threading ends
+static gpr_cv g_cv_shutdown;
+// number of threads in the system
+static int g_thread_count;
+// number of threads sitting around waiting
+static int g_waiter_count;
+// linked list of threads that have completed (and need joining)
+static completed_thread *g_completed_threads;
+// was the manager kicked by the timer system
+static bool g_kicked;
+// is there a thread waiting until the next timer should fire?
+static bool g_has_timed_waiter;
+// generation counter to track which thread is waiting for the next timer
+static uint64_t g_timed_waiter_generation;
+
+static void timer_thread(void *unused);
+
+static void gc_completed_threads(void) {
+ if (g_completed_threads != NULL) {
+ completed_thread *to_gc = g_completed_threads;
+ g_completed_threads = NULL;
+ gpr_mu_unlock(&g_mu);
+ while (to_gc != NULL) {
+ gpr_thd_join(to_gc->t);
+ completed_thread *next = to_gc->next;
+ gpr_free(to_gc);
+ to_gc = next;
+ }
+ gpr_mu_lock(&g_mu);
+ }
+}
+
+static void start_timer_thread_and_unlock(void) {
+ GPR_ASSERT(g_threaded);
+ ++g_waiter_count;
+ ++g_thread_count;
+ gpr_mu_unlock(&g_mu);
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "Spawn timer thread");
+ }
+ gpr_thd_id thd;
+ gpr_thd_options opt = gpr_thd_options_default();
+ gpr_thd_options_set_joinable(&opt);
+ gpr_thd_new(&thd, timer_thread, NULL, &opt);
+}
+
+void grpc_timer_manager_tick() {
+ grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
+ gpr_timespec next = gpr_inf_future(GPR_CLOCK_MONOTONIC);
+ gpr_timespec now = gpr_now(GPR_CLOCK_MONOTONIC);
+ grpc_timer_check(&exec_ctx, now, &next);
+ grpc_exec_ctx_finish(&exec_ctx);
+}
+
+static void timer_thread(void *unused) {
+ // this threads exec_ctx: we try to run things through to completion here
+ // since it's easy to spin up new threads
+ grpc_exec_ctx exec_ctx =
+ GRPC_EXEC_CTX_INITIALIZER(0, grpc_never_ready_to_finish, NULL);
+ const gpr_timespec inf_future = gpr_inf_future(GPR_CLOCK_MONOTONIC);
+ for (;;) {
+ gpr_timespec next = inf_future;
+ gpr_timespec now = gpr_now(GPR_CLOCK_MONOTONIC);
+ // check timer state, updates next to the next time to run a check
+ if (grpc_timer_check(&exec_ctx, now, &next)) {
+ // if there's something to execute...
+ gpr_mu_lock(&g_mu);
+ // remove a waiter from the pool, and start another thread if necessary
+ --g_waiter_count;
+ if (g_waiter_count == 0 && g_threaded) {
+ start_timer_thread_and_unlock();
+ } else {
+ // if there's no thread waiting with a timeout, kick an existing waiter
+ // so that the next deadline is not missed
+ if (!g_has_timed_waiter) {
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "kick untimed waiter");
+ }
+ gpr_cv_signal(&g_cv_wait);
+ }
+ gpr_mu_unlock(&g_mu);
+ }
+ // without our lock, flush the exec_ctx
+ grpc_exec_ctx_flush(&exec_ctx);
+ gpr_mu_lock(&g_mu);
+ // garbage collect any threads hanging out that are dead
+ gc_completed_threads();
+ // get ready to wait again
+ ++g_waiter_count;
+ gpr_mu_unlock(&g_mu);
+ } else {
+ gpr_mu_lock(&g_mu);
+ // if we're not threaded anymore, leave
+ if (!g_threaded) break;
+ // if there's no timed waiter, we should become one: that waiter waits
+ // only until the next timer should expire
+ // all other timers wait forever
+ uint64_t my_timed_waiter_generation = g_timed_waiter_generation - 1;
+ if (!g_has_timed_waiter) {
+ g_has_timed_waiter = true;
+ // we use a generation counter to track the timed waiter so we can
+ // cancel an existing one quickly (and when it actually times out it'll
+ // figure stuff out instead of incurring a wakeup)
+ my_timed_waiter_generation = ++g_timed_waiter_generation;
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "sleep for a while");
+ }
+ } else {
+ next = inf_future;
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "sleep until kicked");
+ }
+ }
+ gpr_cv_wait(&g_cv_wait, &g_mu, next);
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "wait ended: was_timed:%d kicked:%d",
+ my_timed_waiter_generation == g_timed_waiter_generation,
+ g_kicked);
+ }
+ // if this was the timed waiter, then we need to check timers, and flag
+ // that there's now no timed waiter... we'll look for a replacement if
+ // there's work to do after checking timers (code above)
+ if (my_timed_waiter_generation == g_timed_waiter_generation) {
+ g_has_timed_waiter = false;
+ }
+ // if this was a kick from the timer system, consume it (and don't stop
+ // this thread yet)
+ if (g_kicked) {
+ grpc_timer_consume_kick();
+ g_kicked = false;
+ }
+ gpr_mu_unlock(&g_mu);
+ }
+ }
+ // terminate the thread: drop the waiter count, thread count, and let whomever
+ // stopped the threading stuff know that we're done
+ --g_waiter_count;
+ --g_thread_count;
+ if (0 == g_thread_count) {
+ gpr_cv_signal(&g_cv_shutdown);
+ }
+ completed_thread *ct = gpr_malloc(sizeof(*ct));
+ ct->t = gpr_thd_currentid();
+ ct->next = g_completed_threads;
+ g_completed_threads = ct;
+ gpr_mu_unlock(&g_mu);
+ grpc_exec_ctx_finish(&exec_ctx);
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "End timer thread");
+ }
+}
+
+static void start_threads(void) {
+ gpr_mu_lock(&g_mu);
+ if (!g_threaded) {
+ g_threaded = true;
+ start_timer_thread_and_unlock();
+ } else {
+ g_threaded = false;
+ gpr_mu_unlock(&g_mu);
+ }
+}
+
+void grpc_timer_manager_init(void) {
+ gpr_mu_init(&g_mu);
+ gpr_cv_init(&g_cv_wait);
+ gpr_cv_init(&g_cv_shutdown);
+ g_threaded = false;
+ g_thread_count = 0;
+ g_waiter_count = 0;
+ g_completed_threads = NULL;
+
+ start_threads();
+}
+
+static void stop_threads(void) {
+ gpr_mu_lock(&g_mu);
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "stop timer threads: threaded=%d", g_threaded);
+ }
+ if (g_threaded) {
+ g_threaded = false;
+ gpr_cv_broadcast(&g_cv_wait);
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "num timer threads: %d", g_thread_count);
+ }
+ while (g_thread_count > 0) {
+ gpr_cv_wait(&g_cv_shutdown, &g_mu, gpr_inf_future(GPR_CLOCK_REALTIME));
+ if (GRPC_TRACER_ON(grpc_timer_check_trace)) {
+ gpr_log(GPR_DEBUG, "num timer threads: %d", g_thread_count);
+ }
+ gc_completed_threads();
+ }
+ }
+ gpr_mu_unlock(&g_mu);
+}
+
+void grpc_timer_manager_shutdown(void) {
+ stop_threads();
+
+ gpr_mu_destroy(&g_mu);
+ gpr_cv_destroy(&g_cv_wait);
+ gpr_cv_destroy(&g_cv_shutdown);
+}
+
+void grpc_timer_manager_set_threading(bool threaded) {
+ if (threaded) {
+ start_threads();
+ } else {
+ stop_threads();
+ }
+}
+
+void grpc_kick_poller(void) {
+ gpr_mu_lock(&g_mu);
+ g_kicked = true;
+ g_has_timed_waiter = false;
+ ++g_timed_waiter_generation;
+ gpr_cv_signal(&g_cv_wait);
+ gpr_mu_unlock(&g_mu);
+}
diff --git a/src/core/lib/iomgr/timer_manager.h b/src/core/lib/iomgr/timer_manager.h
new file mode 100644
index 0000000000..46729ccea6
--- /dev/null
+++ b/src/core/lib/iomgr/timer_manager.h
@@ -0,0 +1,52 @@
+/*
+ *
+ * Copyright 2017, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_IOMGR_TIMER_MANAGER_H
+#define GRPC_CORE_LIB_IOMGR_TIMER_MANAGER_H
+
+#include <stdbool.h>
+
+/* Timer Manager tries to keep one thread waiting for the next timeout at all
+ times */
+
+void grpc_timer_manager_init(void);
+void grpc_timer_manager_shutdown(void);
+
+/* enable/disable threading - must be called after grpc_timer_manager_init and
+ * before grpc_timer_manager_shutdown */
+void grpc_timer_manager_set_threading(bool enabled);
+/* explicitly perform one tick of the timer system - for when threading is
+ * disabled */
+void grpc_timer_manager_tick(void);
+
+#endif /* GRPC_CORE_LIB_IOMGR_TIMER_MANAGER_H */
diff --git a/src/core/lib/iomgr/timer_uv.c b/src/core/lib/iomgr/timer_uv.c
index 8e8a07578c..2952e44b58 100644
--- a/src/core/lib/iomgr/timer_uv.c
+++ b/src/core/lib/iomgr/timer_uv.c
@@ -38,10 +38,14 @@
#include <grpc/support/alloc.h>
#include <grpc/support/log.h>
+#include "src/core/lib/debug/trace.h"
#include "src/core/lib/iomgr/timer.h"
#include <uv.h>
+grpc_tracer_flag grpc_timer_trace = GRPC_TRACER_INITIALIZER(false);
+grpc_tracer_flag grpc_timer_check_trace = GRPC_TRACER_INITIALIZER(false);
+
static void timer_close_callback(uv_handle_t *handle) { gpr_free(handle); }
static void stop_uv_timer(uv_timer_t *handle) {
@@ -100,4 +104,6 @@ bool grpc_timer_check(grpc_exec_ctx *exec_ctx, gpr_timespec now,
void grpc_timer_list_init(gpr_timespec now) {}
void grpc_timer_list_shutdown(grpc_exec_ctx *exec_ctx) {}
+void grpc_timer_consume_kick(void) {}
+
#endif /* GRPC_UV */