diff options
Diffstat (limited to 'src/core/lib/iomgr/ev_epoll_linux.c')
-rw-r--r-- | src/core/lib/iomgr/ev_epoll_linux.c | 251 |
1 files changed, 196 insertions, 55 deletions
diff --git a/src/core/lib/iomgr/ev_epoll_linux.c b/src/core/lib/iomgr/ev_epoll_linux.c index ab77ebc78b..db51ec4939 100644 --- a/src/core/lib/iomgr/ev_epoll_linux.c +++ b/src/core/lib/iomgr/ev_epoll_linux.c @@ -32,10 +32,10 @@ */ #include <grpc/grpc_posix.h> -#include <grpc/support/port_platform.h> +#include "src/core/lib/iomgr/port.h" /* This polling engine is only relevant on linux kernels supporting epoll() */ -#ifdef GPR_LINUX_EPOLL +#ifdef GRPC_LINUX_EPOLL #include "src/core/lib/iomgr/ev_epoll_linux.h" @@ -152,20 +152,20 @@ static void fd_global_shutdown(void); * Polling island Declarations */ -//#define GRPC_PI_REF_COUNT_DEBUG -#ifdef GRPC_PI_REF_COUNT_DEBUG +#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG #define PI_ADD_REF(p, r) pi_add_ref_dbg((p), (r), __FILE__, __LINE__) #define PI_UNREF(exec_ctx, p, r) \ pi_unref_dbg((exec_ctx), (p), (r), __FILE__, __LINE__) -#else /* defined(GRPC_PI_REF_COUNT_DEBUG) */ +#else /* defined(GRPC_WORKQUEUE_REFCOUNT_DEBUG) */ #define PI_ADD_REF(p, r) pi_add_ref((p)) #define PI_UNREF(exec_ctx, p, r) pi_unref((exec_ctx), (p)) #endif /* !defined(GPRC_PI_REF_COUNT_DEBUG) */ +/* This is also used as grpc_workqueue (by directly casing it) */ typedef struct polling_island { gpr_mu mu; /* Ref count. Use PI_ADD_REF() and PI_UNREF() macros to increment/decrement @@ -185,8 +185,17 @@ typedef struct polling_island { * (except mu and ref_count) are invalid and must be ignored. */ gpr_atm merged_to; - /* The workqueue associated with this polling island */ - grpc_workqueue *workqueue; + /* Number of threads currently polling on this island */ + gpr_atm poller_count; + /* Mutex guarding the read end of the workqueue (must be held to pop from + * workqueue_items) */ + gpr_mu workqueue_read_mu; + /* Queue of closures to be executed */ + gpr_mpscq workqueue_items; + /* Count of items in workqueue_items */ + gpr_atm workqueue_item_count; + /* Wakeup fd used to wake pollers to check the contents of workqueue_items */ + grpc_wakeup_fd workqueue_wakeup_fd; /* The fd of the underlying epoll set */ int epoll_fd; @@ -275,6 +284,10 @@ static bool append_error(grpc_error **composite, grpc_error *error, threads that woke up MUST NOT call grpc_wakeup_fd_consume_wakeup() */ static grpc_wakeup_fd polling_island_wakeup_fd; +/* The polling island being polled right now. + See comments in workqueue_maybe_wakeup for why this is tracked. */ +static __thread polling_island *g_current_thread_polling_island; + /* Forward declaration */ static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi); @@ -289,12 +302,12 @@ static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi); gpr_atm g_epoll_sync; #endif /* defined(GRPC_TSAN) */ -#ifdef GRPC_PI_REF_COUNT_DEBUG static void pi_add_ref(polling_island *pi); static void pi_unref(grpc_exec_ctx *exec_ctx, polling_island *pi); -static void pi_add_ref_dbg(polling_island *pi, char *reason, char *file, - int line) { +#ifdef GRPC_WORKQUEUE_REFCOUNT_DEBUG +static void pi_add_ref_dbg(polling_island *pi, const char *reason, + const char *file, int line) { long old_cnt = gpr_atm_acq_load(&pi->ref_count); pi_add_ref(pi); gpr_log(GPR_DEBUG, "Add ref pi: %p, old: %ld -> new:%ld (%s) - (%s, %d)", @@ -302,12 +315,42 @@ static void pi_add_ref_dbg(polling_island *pi, char *reason, char *file, } static void pi_unref_dbg(grpc_exec_ctx *exec_ctx, polling_island *pi, - char *reason, char *file, int line) { + const char *reason, const char *file, int line) { long old_cnt = gpr_atm_acq_load(&pi->ref_count); pi_unref(exec_ctx, pi); gpr_log(GPR_DEBUG, "Unref pi: %p, old:%ld -> new:%ld (%s) - (%s, %d)", (void *)pi, old_cnt, (old_cnt - 1), reason, file, line); } + +static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue, + const char *file, int line, + const char *reason) { + if (workqueue != NULL) { + pi_add_ref_dbg((polling_island *)workqueue, reason, file, line); + } + return workqueue; +} + +static void workqueue_unref(grpc_exec_ctx *exec_ctx, grpc_workqueue *workqueue, + const char *file, int line, const char *reason) { + if (workqueue != NULL) { + pi_unref_dbg(exec_ctx, (polling_island *)workqueue, reason, file, line); + } +} +#else +static grpc_workqueue *workqueue_ref(grpc_workqueue *workqueue) { + if (workqueue != NULL) { + pi_add_ref((polling_island *)workqueue); + } + return workqueue; +} + +static void workqueue_unref(grpc_exec_ctx *exec_ctx, + grpc_workqueue *workqueue) { + if (workqueue != NULL) { + pi_unref(exec_ctx, (polling_island *)workqueue); + } +} #endif static void pi_add_ref(polling_island *pi) { @@ -315,10 +358,7 @@ static void pi_add_ref(polling_island *pi) { } static void pi_unref(grpc_exec_ctx *exec_ctx, polling_island *pi) { - /* If ref count went to one, we're back to just the workqueue owning a ref. - Unref the workqueue to break the loop. - - If ref count went to zero, delete the polling island. + /* If ref count went to zero, delete the polling island. Note that this deletion not be done under a lock. Once the ref count goes to zero, we are guaranteed that no one else holds a reference to the polling island (and that there is no racing pi_add_ref() call either). @@ -326,20 +366,12 @@ static void pi_unref(grpc_exec_ctx *exec_ctx, polling_island *pi) { Also, if we are deleting the polling island and the merged_to field is non-empty, we should remove a ref to the merged_to polling island */ - switch (gpr_atm_full_fetch_add(&pi->ref_count, -1)) { - case 2: /* last external ref: the only one now owned is by the workqueue */ - GRPC_WORKQUEUE_UNREF(exec_ctx, pi->workqueue, "polling_island"); - break; - case 1: { - polling_island *next = (polling_island *)gpr_atm_acq_load(&pi->merged_to); - polling_island_delete(exec_ctx, pi); - if (next != NULL) { - PI_UNREF(exec_ctx, next, "pi_delete"); /* Recursive call */ - } - break; + if (1 == gpr_atm_full_fetch_add(&pi->ref_count, -1)) { + polling_island *next = (polling_island *)gpr_atm_acq_load(&pi->merged_to); + polling_island_delete(exec_ctx, pi); + if (next != NULL) { + PI_UNREF(exec_ctx, next, "pi_delete"); /* Recursive call */ } - case 0: - GPR_UNREACHABLE_CODE(return ); } } @@ -488,11 +520,20 @@ static polling_island *polling_island_create(grpc_exec_ctx *exec_ctx, pi->fd_capacity = 0; pi->fds = NULL; pi->epoll_fd = -1; - pi->workqueue = NULL; + + gpr_mu_init(&pi->workqueue_read_mu); + gpr_mpscq_init(&pi->workqueue_items); + gpr_atm_rel_store(&pi->workqueue_item_count, 0); gpr_atm_rel_store(&pi->ref_count, 0); + gpr_atm_rel_store(&pi->poller_count, 0); gpr_atm_rel_store(&pi->merged_to, (gpr_atm)NULL); + if (!append_error(error, grpc_wakeup_fd_init(&pi->workqueue_wakeup_fd), + err_desc)) { + goto done; + } + pi->epoll_fd = epoll_create1(EPOLL_CLOEXEC); if (pi->epoll_fd < 0) { @@ -501,26 +542,14 @@ static polling_island *polling_island_create(grpc_exec_ctx *exec_ctx, } polling_island_add_wakeup_fd_locked(pi, &grpc_global_wakeup_fd, error); + polling_island_add_wakeup_fd_locked(pi, &pi->workqueue_wakeup_fd, error); if (initial_fd != NULL) { polling_island_add_fds_locked(pi, &initial_fd, 1, true, error); } - if (append_error(error, grpc_workqueue_create(exec_ctx, &pi->workqueue), - err_desc) && - *error == GRPC_ERROR_NONE) { - polling_island_add_fds_locked(pi, &pi->workqueue->wakeup_read_fd, 1, true, - error); - GPR_ASSERT(pi->workqueue->wakeup_read_fd->polling_island == NULL); - pi->workqueue->wakeup_read_fd->polling_island = pi; - PI_ADD_REF(pi, "fd"); - } - done: if (*error != GRPC_ERROR_NONE) { - if (pi->workqueue != NULL) { - GRPC_WORKQUEUE_UNREF(exec_ctx, pi->workqueue, "polling_island"); - } polling_island_delete(exec_ctx, pi); pi = NULL; } @@ -533,7 +562,11 @@ static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi) { if (pi->epoll_fd >= 0) { close(pi->epoll_fd); } + GPR_ASSERT(gpr_atm_no_barrier_load(&pi->workqueue_item_count) == 0); + gpr_mu_destroy(&pi->workqueue_read_mu); + gpr_mpscq_destroy(&pi->workqueue_items); gpr_mu_destroy(&pi->mu); + grpc_wakeup_fd_destroy(&pi->workqueue_wakeup_fd); gpr_free(pi->fds); gpr_free(pi); } @@ -678,6 +711,45 @@ static void polling_island_unlock_pair(polling_island *p, polling_island *q) { } } +static void workqueue_maybe_wakeup(polling_island *pi) { + /* If this thread is the current poller, then it may be that it's about to + decrement the current poller count, so we need to look past this thread */ + bool is_current_poller = (g_current_thread_polling_island == pi); + gpr_atm min_current_pollers_for_wakeup = is_current_poller ? 1 : 0; + gpr_atm current_pollers = gpr_atm_no_barrier_load(&pi->poller_count); + /* Only issue a wakeup if it's likely that some poller could come in and take + it right now. Note that since we do an anticipatory mpscq_pop every poll + loop, it's ok if we miss the wakeup here, as we'll get the work item when + the next poller enters anyway. */ + if (current_pollers > min_current_pollers_for_wakeup) { + GRPC_LOG_IF_ERROR("workqueue_wakeup_fd", + grpc_wakeup_fd_wakeup(&pi->workqueue_wakeup_fd)); + } +} + +static void workqueue_move_items_to_parent(polling_island *q) { + polling_island *p = (polling_island *)gpr_atm_no_barrier_load(&q->merged_to); + if (p == NULL) { + return; + } + gpr_mu_lock(&q->workqueue_read_mu); + int num_added = 0; + while (gpr_atm_no_barrier_load(&q->workqueue_item_count) > 0) { + gpr_mpscq_node *n = gpr_mpscq_pop(&q->workqueue_items); + if (n != NULL) { + gpr_atm_no_barrier_fetch_add(&q->workqueue_item_count, -1); + gpr_atm_no_barrier_fetch_add(&p->workqueue_item_count, 1); + gpr_mpscq_push(&p->workqueue_items, n); + num_added++; + } + } + gpr_mu_unlock(&q->workqueue_read_mu); + if (num_added > 0) { + workqueue_maybe_wakeup(p); + } + workqueue_move_items_to_parent(p); +} + static polling_island *polling_island_merge(polling_island *p, polling_island *q, grpc_error **error) { @@ -702,6 +774,8 @@ static polling_island *polling_island_merge(polling_island *p, /* Add the 'merged_to' link from p --> q */ gpr_atm_rel_store(&p->merged_to, (gpr_atm)q); PI_ADD_REF(q, "pi_merge"); /* To account for the new incoming ref from p */ + + workqueue_move_items_to_parent(q); } /* else if p == q, nothing needs to be done */ @@ -712,6 +786,26 @@ static polling_island *polling_island_merge(polling_island *p, return q; } +static void workqueue_enqueue(grpc_exec_ctx *exec_ctx, + grpc_workqueue *workqueue, grpc_closure *closure, + grpc_error *error) { + GPR_TIMER_BEGIN("workqueue.enqueue", 0); + /* take a ref to the workqueue: otherwise it can happen that whatever events + * this kicks off ends up destroying the workqueue before this function + * completes */ + GRPC_WORKQUEUE_REF(workqueue, "enqueue"); + polling_island *pi = (polling_island *)workqueue; + gpr_atm last = gpr_atm_no_barrier_fetch_add(&pi->workqueue_item_count, 1); + closure->error_data.error = error; + gpr_mpscq_push(&pi->workqueue_items, &closure->next_data.atm_next); + if (last == 0) { + workqueue_maybe_wakeup(pi); + } + workqueue_move_items_to_parent(pi); + GRPC_WORKQUEUE_UNREF(exec_ctx, workqueue, "enqueue"); + GPR_TIMER_END("workqueue.enqueue", 0); +} + static grpc_error *polling_island_global_init() { grpc_error *error = GRPC_ERROR_NONE; @@ -1042,11 +1136,8 @@ static void fd_notify_on_write(grpc_exec_ctx *exec_ctx, grpc_fd *fd, static grpc_workqueue *fd_get_workqueue(grpc_fd *fd) { gpr_mu_lock(&fd->mu); - grpc_workqueue *workqueue = NULL; - if (fd->polling_island != NULL) { - workqueue = - GRPC_WORKQUEUE_REF(fd->polling_island->workqueue, "get_workqueue"); - } + grpc_workqueue *workqueue = GRPC_WORKQUEUE_REF( + (grpc_workqueue *)fd->polling_island, "fd_get_workqueue"); gpr_mu_unlock(&fd->mu); return workqueue; } @@ -1299,7 +1390,29 @@ static void pollset_reset(grpc_pollset *pollset) { GPR_ASSERT(pollset->polling_island == NULL); } -#define GRPC_EPOLL_MAX_EVENTS 1000 +static bool maybe_do_workqueue_work(grpc_exec_ctx *exec_ctx, + polling_island *pi) { + if (gpr_mu_trylock(&pi->workqueue_read_mu)) { + gpr_mpscq_node *n = gpr_mpscq_pop(&pi->workqueue_items); + gpr_mu_unlock(&pi->workqueue_read_mu); + if (n != NULL) { + if (gpr_atm_full_fetch_add(&pi->workqueue_item_count, -1) > 1) { + workqueue_maybe_wakeup(pi); + } + grpc_closure *c = (grpc_closure *)n; + grpc_closure_run(exec_ctx, c, c->error_data.error); + return true; + } else if (gpr_atm_no_barrier_load(&pi->workqueue_item_count) > 0) { + /* n == NULL might mean there's work but it's not available to be popped + * yet - try to ensure another workqueue wakes up to check shortly if so + */ + workqueue_maybe_wakeup(pi); + } + } + return false; +} + +#define GRPC_EPOLL_MAX_EVENTS 100 /* Note: sig_mask contains the signal mask to use *during* epoll_wait() */ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset, @@ -1354,7 +1467,13 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx, PI_ADD_REF(pi, "ps_work"); gpr_mu_unlock(&pollset->mu); - do { + /* If we get some workqueue work to do, it might end up completing an item on + the completion queue, so there's no need to poll... so we skip that and + redo the complete loop to verify */ + if (!maybe_do_workqueue_work(exec_ctx, pi)) { + gpr_atm_no_barrier_fetch_add(&pi->poller_count, 1); + g_current_thread_polling_island = pi; + GRPC_SCHEDULING_START_BLOCKING_REGION; ep_rv = epoll_pwait(epoll_fd, ep_ev, GRPC_EPOLL_MAX_EVENTS, timeout_ms, sig_mask); @@ -1386,6 +1505,11 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx, append_error(error, grpc_wakeup_fd_consume_wakeup(&grpc_global_wakeup_fd), err_desc); + } else if (data_ptr == &pi->workqueue_wakeup_fd) { + append_error(error, + grpc_wakeup_fd_consume_wakeup(&grpc_global_wakeup_fd), + err_desc); + maybe_do_workqueue_work(exec_ctx, pi); } else if (data_ptr == &polling_island_wakeup_fd) { GRPC_POLLING_TRACE( "pollset_work: pollset: %p, worker: %p polling island (epoll_fd: " @@ -1408,7 +1532,10 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx, } } } - } while (ep_rv == GRPC_EPOLL_MAX_EVENTS); + + g_current_thread_polling_island = NULL; + gpr_atm_no_barrier_fetch_add(&pi->poller_count, -1); + } GPR_ASSERT(pi != NULL); @@ -1584,6 +1711,12 @@ retry: "pollset_add_fd: Raced creating new polling island. pi_new: %p " "(fd: %d, pollset: %p)", (void *)pi_new, fd->fd, (void *)pollset); + + /* No need to lock 'pi_new' here since this is a new polling island and + * no one has a reference to it yet */ + polling_island_remove_all_fds_locked(pi_new, true, &error); + + /* Ref and unref so that the polling island gets deleted during unref */ PI_ADD_REF(pi_new, "dance_of_destruction"); PI_UNREF(exec_ctx, pi_new, "dance_of_destruction"); goto retry; @@ -1868,6 +2001,10 @@ static const grpc_event_engine_vtable vtable = { .kick_poller = kick_poller, + .workqueue_ref = workqueue_ref, + .workqueue_unref = workqueue_unref, + .workqueue_enqueue = workqueue_enqueue, + .shutdown_engine = shutdown_engine, }; @@ -1892,6 +2029,10 @@ const grpc_event_engine_vtable *grpc_init_epoll_linux(void) { return NULL; } + if (!grpc_has_wakeup_fd()) { + return NULL; + } + if (!is_epoll_available()) { return NULL; } @@ -1914,13 +2055,13 @@ const grpc_event_engine_vtable *grpc_init_epoll_linux(void) { return &vtable; } -#else /* defined(GPR_LINUX_EPOLL) */ -#if defined(GPR_POSIX_SOCKET) +#else /* defined(GRPC_LINUX_EPOLL) */ +#if defined(GRPC_POSIX_SOCKET) #include "src/core/lib/iomgr/ev_posix.h" -/* If GPR_LINUX_EPOLL is not defined, it means epoll is not available. Return +/* If GRPC_LINUX_EPOLL is not defined, it means epoll is not available. Return * NULL */ const grpc_event_engine_vtable *grpc_init_epoll_linux(void) { return NULL; } -#endif /* defined(GPR_POSIX_SOCKET) */ +#endif /* defined(GRPC_POSIX_SOCKET) */ void grpc_use_signal(int signum) {} -#endif /* !defined(GPR_LINUX_EPOLL) */ +#endif /* !defined(GRPC_LINUX_EPOLL) */ |