diff options
Diffstat (limited to 'src/core/lib/iomgr/ev_epoll_linux.c')
-rw-r--r-- | src/core/lib/iomgr/ev_epoll_linux.c | 277 |
1 files changed, 161 insertions, 116 deletions
diff --git a/src/core/lib/iomgr/ev_epoll_linux.c b/src/core/lib/iomgr/ev_epoll_linux.c index cf0fe736a0..6a63c4d1d1 100644 --- a/src/core/lib/iomgr/ev_epoll_linux.c +++ b/src/core/lib/iomgr/ev_epoll_linux.c @@ -57,6 +57,7 @@ #include "src/core/lib/iomgr/ev_posix.h" #include "src/core/lib/iomgr/iomgr_internal.h" #include "src/core/lib/iomgr/wakeup_fd_posix.h" +#include "src/core/lib/iomgr/workqueue.h" #include "src/core/lib/profiling/timers.h" #include "src/core/lib/support/block_annotate.h" @@ -113,9 +114,7 @@ struct grpc_fd { grpc_closure *read_closure; grpc_closure *write_closure; - /* The polling island to which this fd belongs to and the mutex protecting the - the field */ - gpr_mu pi_mu; + /* The polling island to which this fd belongs to (protected by mu) */ struct polling_island *polling_island; struct grpc_fd *freelist_next; @@ -152,16 +151,17 @@ static void fd_global_shutdown(void); * Polling island Declarations */ -// #define GRPC_PI_REF_COUNT_DEBUG +//#define GRPC_PI_REF_COUNT_DEBUG #ifdef GRPC_PI_REF_COUNT_DEBUG #define PI_ADD_REF(p, r) pi_add_ref_dbg((p), (r), __FILE__, __LINE__) -#define PI_UNREF(p, r) pi_unref_dbg((p), (r), __FILE__, __LINE__) +#define PI_UNREF(exec_ctx, p, r) \ + pi_unref_dbg((exec_ctx), (p), (r), __FILE__, __LINE__) #else /* defined(GRPC_PI_REF_COUNT_DEBUG) */ #define PI_ADD_REF(p, r) pi_add_ref((p)) -#define PI_UNREF(p, r) pi_unref((p)) +#define PI_UNREF(exec_ctx, p, r) pi_unref((exec_ctx), (p)) #endif /* !defined(GPRC_PI_REF_COUNT_DEBUG) */ @@ -172,7 +172,7 @@ typedef struct polling_island { Once the ref count becomes zero, this structure is destroyed which means we should ensure that there is never a scenario where a PI_ADD_REF() is racing with a PI_UNREF() that just made the ref_count zero. */ - gpr_refcount ref_count; + gpr_atm ref_count; /* Pointer to the polling_island this merged into. * merged_to value is only set once in polling_island's lifetime (and that too @@ -184,6 +184,9 @@ typedef struct polling_island { * (except mu and ref_count) are invalid and must be ignored. */ gpr_atm merged_to; + /* The workqueue associated with this polling island */ + grpc_workqueue *workqueue; + /* The fd of the underlying epoll set */ int epoll_fd; @@ -191,11 +194,6 @@ typedef struct polling_island { size_t fd_cnt; size_t fd_capacity; grpc_fd **fds; - - /* Polling islands that are no longer needed are kept in a freelist so that - they can be reused. This field points to the next polling island in the - free list */ - struct polling_island *next_free; } polling_island; /******************************************************************************* @@ -253,13 +251,14 @@ struct grpc_pollset_set { * Common helpers */ -static void append_error(grpc_error **composite, grpc_error *error, +static bool append_error(grpc_error **composite, grpc_error *error, const char *desc) { - if (error == GRPC_ERROR_NONE) return; + if (error == GRPC_ERROR_NONE) return true; if (*composite == GRPC_ERROR_NONE) { *composite = GRPC_ERROR_CREATE(desc); } *composite = grpc_error_add_child(*composite, error); + return false; } /******************************************************************************* @@ -275,11 +274,8 @@ static void append_error(grpc_error **composite, grpc_error *error, threads that woke up MUST NOT call grpc_wakeup_fd_consume_wakeup() */ static grpc_wakeup_fd polling_island_wakeup_fd; -/* Polling island freelist */ -static gpr_mu g_pi_freelist_mu; -static polling_island *g_pi_freelist = NULL; - -static void polling_island_delete(); /* Forward declaration */ +/* Forward declaration */ +static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi); #ifdef GRPC_TSAN /* Currently TSAN may incorrectly flag data races between epoll_ctl and @@ -293,28 +289,35 @@ gpr_atm g_epoll_sync; #endif /* defined(GRPC_TSAN) */ #ifdef GRPC_PI_REF_COUNT_DEBUG -void pi_add_ref(polling_island *pi); -void pi_unref(polling_island *pi); +static void pi_add_ref(polling_island *pi); +static void pi_unref(grpc_exec_ctx *exec_ctx, polling_island *pi); -void pi_add_ref_dbg(polling_island *pi, char *reason, char *file, int line) { - long old_cnt = gpr_atm_acq_load(&(pi->ref_count.count)); +static void pi_add_ref_dbg(polling_island *pi, char *reason, char *file, + int line) { + long old_cnt = gpr_atm_acq_load(&pi->ref_count); pi_add_ref(pi); gpr_log(GPR_DEBUG, "Add ref pi: %p, old: %ld -> new:%ld (%s) - (%s, %d)", (void *)pi, old_cnt, old_cnt + 1, reason, file, line); } -void pi_unref_dbg(polling_island *pi, char *reason, char *file, int line) { - long old_cnt = gpr_atm_acq_load(&(pi->ref_count.count)); - pi_unref(pi); +static void pi_unref_dbg(grpc_exec_ctx *exec_ctx, polling_island *pi, + char *reason, char *file, int line) { + long old_cnt = gpr_atm_acq_load(&pi->ref_count); + pi_unref(exec_ctx, pi); gpr_log(GPR_DEBUG, "Unref pi: %p, old:%ld -> new:%ld (%s) - (%s, %d)", (void *)pi, old_cnt, (old_cnt - 1), reason, file, line); } #endif -void pi_add_ref(polling_island *pi) { gpr_ref(&pi->ref_count); } +static void pi_add_ref(polling_island *pi) { + gpr_atm_no_barrier_fetch_add(&pi->ref_count, 1); +} -void pi_unref(polling_island *pi) { - /* If ref count went to zero, delete the polling island. +static void pi_unref(grpc_exec_ctx *exec_ctx, polling_island *pi) { + /* If ref count went to one, we're back to just the workqueue owning a ref. + Unref the workqueue to break the loop. + + If ref count went to zero, delete the polling island. Note that this deletion not be done under a lock. Once the ref count goes to zero, we are guaranteed that no one else holds a reference to the polling island (and that there is no racing pi_add_ref() call either). @@ -322,12 +325,20 @@ void pi_unref(polling_island *pi) { Also, if we are deleting the polling island and the merged_to field is non-empty, we should remove a ref to the merged_to polling island */ - if (gpr_unref(&pi->ref_count)) { - polling_island *next = (polling_island *)gpr_atm_acq_load(&pi->merged_to); - polling_island_delete(pi); - if (next != NULL) { - PI_UNREF(next, "pi_delete"); /* Recursive call */ + switch (gpr_atm_full_fetch_add(&pi->ref_count, -1)) { + case 2: /* last external ref: the only one now owned is by the workqueue */ + GRPC_WORKQUEUE_UNREF(exec_ctx, pi->workqueue, "polling_island"); + break; + case 1: { + polling_island *next = (polling_island *)gpr_atm_acq_load(&pi->merged_to); + polling_island_delete(exec_ctx, pi); + if (next != NULL) { + PI_UNREF(exec_ctx, next, "pi_delete"); /* Recursive call */ + } + break; } + case 0: + GPR_UNREACHABLE_CODE(return ); } } @@ -462,69 +473,68 @@ static void polling_island_remove_fd_locked(polling_island *pi, grpc_fd *fd, } /* Might return NULL in case of an error */ -static polling_island *polling_island_create(grpc_fd *initial_fd, +static polling_island *polling_island_create(grpc_exec_ctx *exec_ctx, + grpc_fd *initial_fd, grpc_error **error) { polling_island *pi = NULL; - char *err_msg; const char *err_desc = "polling_island_create"; - /* Try to get one from the polling island freelist */ - gpr_mu_lock(&g_pi_freelist_mu); - if (g_pi_freelist != NULL) { - pi = g_pi_freelist; - g_pi_freelist = g_pi_freelist->next_free; - pi->next_free = NULL; - } - gpr_mu_unlock(&g_pi_freelist_mu); + *error = GRPC_ERROR_NONE; - /* Create new polling island if we could not get one from the free list */ - if (pi == NULL) { - pi = gpr_malloc(sizeof(*pi)); - gpr_mu_init(&pi->mu); - pi->fd_cnt = 0; - pi->fd_capacity = 0; - pi->fds = NULL; - } + pi = gpr_malloc(sizeof(*pi)); + gpr_mu_init(&pi->mu); + pi->fd_cnt = 0; + pi->fd_capacity = 0; + pi->fds = NULL; + pi->epoll_fd = -1; + pi->workqueue = NULL; - gpr_ref_init(&pi->ref_count, 0); + gpr_atm_rel_store(&pi->ref_count, 0); gpr_atm_rel_store(&pi->merged_to, (gpr_atm)NULL); pi->epoll_fd = epoll_create1(EPOLL_CLOEXEC); if (pi->epoll_fd < 0) { - gpr_asprintf(&err_msg, "epoll_create1 failed with error %d (%s)", errno, - strerror(errno)); - append_error(error, GRPC_OS_ERROR(errno, err_msg), err_desc); - gpr_free(err_msg); - } else { - polling_island_add_wakeup_fd_locked(pi, &grpc_global_wakeup_fd, error); - pi->next_free = NULL; + append_error(error, GRPC_OS_ERROR(errno, "epoll_create1"), err_desc); + goto done; + } - if (initial_fd != NULL) { - /* Lock the polling island here just in case we got this structure from - the freelist and the polling island lock was not released yet (by the - code that adds the polling island to the freelist) */ - gpr_mu_lock(&pi->mu); - polling_island_add_fds_locked(pi, &initial_fd, 1, true, error); - gpr_mu_unlock(&pi->mu); - } + polling_island_add_wakeup_fd_locked(pi, &grpc_global_wakeup_fd, error); + + if (initial_fd != NULL) { + polling_island_add_fds_locked(pi, &initial_fd, 1, true, error); + } + + if (append_error(error, grpc_workqueue_create(exec_ctx, &pi->workqueue), + err_desc) && + *error == GRPC_ERROR_NONE) { + polling_island_add_fds_locked(pi, &pi->workqueue->wakeup_read_fd, 1, true, + error); + GPR_ASSERT(pi->workqueue->wakeup_read_fd->polling_island == NULL); + pi->workqueue->wakeup_read_fd->polling_island = pi; + PI_ADD_REF(pi, "fd"); } +done: + if (*error != GRPC_ERROR_NONE) { + if (pi->workqueue != NULL) { + GRPC_WORKQUEUE_UNREF(exec_ctx, pi->workqueue, "polling_island"); + } + polling_island_delete(exec_ctx, pi); + pi = NULL; + } return pi; } -static void polling_island_delete(polling_island *pi) { +static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi) { GPR_ASSERT(pi->fd_cnt == 0); - gpr_atm_rel_store(&pi->merged_to, (gpr_atm)NULL); - - close(pi->epoll_fd); - pi->epoll_fd = -1; - - gpr_mu_lock(&g_pi_freelist_mu); - pi->next_free = g_pi_freelist; - g_pi_freelist = pi; - gpr_mu_unlock(&g_pi_freelist_mu); + if (pi->epoll_fd >= 0) { + close(pi->epoll_fd); + } + gpr_mu_destroy(&pi->mu); + gpr_free(pi->fds); + gpr_free(pi); } /* Attempts to gets the last polling island in the linked list (liked by the @@ -704,9 +714,6 @@ static polling_island *polling_island_merge(polling_island *p, static grpc_error *polling_island_global_init() { grpc_error *error = GRPC_ERROR_NONE; - gpr_mu_init(&g_pi_freelist_mu); - g_pi_freelist = NULL; - error = grpc_wakeup_fd_init(&polling_island_wakeup_fd); if (error == GRPC_ERROR_NONE) { error = grpc_wakeup_fd_wakeup(&polling_island_wakeup_fd); @@ -716,18 +723,6 @@ static grpc_error *polling_island_global_init() { } static void polling_island_global_shutdown() { - polling_island *next; - gpr_mu_lock(&g_pi_freelist_mu); - gpr_mu_unlock(&g_pi_freelist_mu); - while (g_pi_freelist != NULL) { - next = g_pi_freelist->next_free; - gpr_mu_destroy(&g_pi_freelist->mu); - gpr_free(g_pi_freelist->fds); - gpr_free(g_pi_freelist); - g_pi_freelist = next; - } - gpr_mu_destroy(&g_pi_freelist_mu); - grpc_wakeup_fd_destroy(&polling_island_wakeup_fd); } @@ -845,7 +840,6 @@ static grpc_fd *fd_create(int fd, const char *name) { if (new_fd == NULL) { new_fd = gpr_malloc(sizeof(grpc_fd)); gpr_mu_init(&new_fd->mu); - gpr_mu_init(&new_fd->pi_mu); } /* Note: It is not really needed to get the new_fd->mu lock here. If this is a @@ -896,6 +890,7 @@ static void fd_orphan(grpc_exec_ctx *exec_ctx, grpc_fd *fd, const char *reason) { bool is_fd_closed = false; grpc_error *error = GRPC_ERROR_NONE; + polling_island *unref_pi = NULL; gpr_mu_lock(&fd->mu); fd->on_done_closure = on_done; @@ -923,21 +918,26 @@ static void fd_orphan(grpc_exec_ctx *exec_ctx, grpc_fd *fd, - Unlock the latest polling island - Set fd->polling_island to NULL (but remove the ref on the polling island before doing this.) */ - gpr_mu_lock(&fd->pi_mu); if (fd->polling_island != NULL) { polling_island *pi_latest = polling_island_lock(fd->polling_island); polling_island_remove_fd_locked(pi_latest, fd, is_fd_closed, &error); gpr_mu_unlock(&pi_latest->mu); - PI_UNREF(fd->polling_island, "fd_orphan"); + unref_pi = fd->polling_island; fd->polling_island = NULL; } - gpr_mu_unlock(&fd->pi_mu); grpc_exec_ctx_sched(exec_ctx, fd->on_done_closure, error, NULL); gpr_mu_unlock(&fd->mu); UNREF_BY(fd, 2, reason); /* Drop the reference */ + if (unref_pi != NULL) { + /* Unref stale polling island here, outside the fd lock above. + The polling island owns a workqueue which owns an fd, and unreffing + inside the lock can cause an eventual lock loop that makes TSAN very + unhappy. */ + PI_UNREF(exec_ctx, unref_pi, "fd_orphan"); + } GRPC_LOG_IF_ERROR("fd_orphan", GRPC_ERROR_REF(error)); } @@ -1037,6 +1037,17 @@ static void fd_notify_on_write(grpc_exec_ctx *exec_ctx, grpc_fd *fd, gpr_mu_unlock(&fd->mu); } +static grpc_workqueue *fd_get_workqueue(grpc_fd *fd) { + gpr_mu_lock(&fd->mu); + grpc_workqueue *workqueue = NULL; + if (fd->polling_island != NULL) { + workqueue = + GRPC_WORKQUEUE_REF(fd->polling_island->workqueue, "get_workqueue"); + } + gpr_mu_unlock(&fd->mu); + return workqueue; +} + /******************************************************************************* * Pollset Definitions */ @@ -1227,9 +1238,10 @@ static void fd_become_writable(grpc_exec_ctx *exec_ctx, grpc_fd *fd) { gpr_mu_unlock(&fd->mu); } -static void pollset_release_polling_island(grpc_pollset *ps, char *reason) { +static void pollset_release_polling_island(grpc_exec_ctx *exec_ctx, + grpc_pollset *ps, char *reason) { if (ps->polling_island != NULL) { - PI_UNREF(ps->polling_island, reason); + PI_UNREF(exec_ctx, ps->polling_island, reason); } ps->polling_island = NULL; } @@ -1242,7 +1254,7 @@ static void finish_shutdown_locked(grpc_exec_ctx *exec_ctx, pollset->finish_shutdown_called = true; /* Release the ref and set pollset->polling_island to NULL */ - pollset_release_polling_island(pollset, "ps_shutdown"); + pollset_release_polling_island(exec_ctx, pollset, "ps_shutdown"); grpc_exec_ctx_sched(exec_ctx, pollset->shutdown_done, GRPC_ERROR_NONE, NULL); } @@ -1281,7 +1293,7 @@ static void pollset_reset(grpc_pollset *pollset) { pollset->finish_shutdown_called = false; pollset->kicked_without_pollers = false; pollset->shutdown_done = NULL; - pollset_release_polling_island(pollset, "ps_reset"); + GPR_ASSERT(pollset->polling_island == NULL); } #define GRPC_EPOLL_MAX_EVENTS 1000 @@ -1309,7 +1321,7 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx, this function (i.e pollset_work_and_unlock()) is called */ if (pollset->polling_island == NULL) { - pollset->polling_island = polling_island_create(NULL, error); + pollset->polling_island = polling_island_create(exec_ctx, NULL, error); if (pollset->polling_island == NULL) { GPR_TIMER_END("pollset_work_and_unlock", 0); return; /* Fatal error. We cannot continue */ @@ -1329,7 +1341,7 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx, /* Always do PI_ADD_REF before PI_UNREF because PI_UNREF may cause the polling island to be deleted */ PI_ADD_REF(pi, "ps"); - PI_UNREF(pollset->polling_island, "ps"); + PI_UNREF(exec_ctx, pollset->polling_island, "ps"); pollset->polling_island = pi; } @@ -1400,7 +1412,7 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx, that we got before releasing the polling island lock). This is because pollset->polling_island pointer might get udpated in other parts of the code when there is an island merge while we are doing epoll_wait() above */ - PI_UNREF(pi, "ps_work"); + PI_UNREF(exec_ctx, pi, "ps_work"); GPR_TIMER_END("pollset_work_and_unlock", 0); } @@ -1517,10 +1529,11 @@ static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset, grpc_error *error = GRPC_ERROR_NONE; gpr_mu_lock(&pollset->mu); - gpr_mu_lock(&fd->pi_mu); + gpr_mu_lock(&fd->mu); polling_island *pi_new = NULL; +retry: /* 1) If fd->polling_island and pollset->polling_island are both non-NULL and * equal, do nothing. * 2) If fd->polling_island and pollset->polling_island are both NULL, create @@ -1535,15 +1548,44 @@ static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset, * polling_island fields in both fd and pollset to point to the merged * polling island. */ + + if (fd->orphaned) { + gpr_mu_unlock(&fd->mu); + gpr_mu_unlock(&pollset->mu); + /* early out */ + return; + } + if (fd->polling_island == pollset->polling_island) { pi_new = fd->polling_island; if (pi_new == NULL) { - pi_new = polling_island_create(fd, &error); - - GRPC_POLLING_TRACE( - "pollset_add_fd: Created new polling island. pi_new: %p (fd: %d, " - "pollset: %p)", - (void *)pi_new, fd->fd, (void *)pollset); + /* Unlock before creating a new polling island: the polling island will + create a workqueue which creates a file descriptor, and holding an fd + lock here can eventually cause a loop to appear to TSAN (making it + unhappy). We don't think it's a real loop (there's an epoch point where + that loop possibility disappears), but the advantages of keeping TSAN + happy outweigh any performance advantage we might have by keeping the + lock held. */ + gpr_mu_unlock(&fd->mu); + pi_new = polling_island_create(exec_ctx, fd, &error); + gpr_mu_lock(&fd->mu); + /* Need to reverify any assumptions made between the initial lock and + getting to this branch: if they've changed, we need to throw away our + work and figure things out again. */ + if (fd->polling_island != NULL) { + GRPC_POLLING_TRACE( + "pollset_add_fd: Raced creating new polling island. pi_new: %p " + "(fd: %d, pollset: %p)", + (void *)pi_new, fd->fd, (void *)pollset); + PI_ADD_REF(pi_new, "dance_of_destruction"); + PI_UNREF(exec_ctx, pi_new, "dance_of_destruction"); + goto retry; + } else { + GRPC_POLLING_TRACE( + "pollset_add_fd: Created new polling island. pi_new: %p (fd: %d, " + "pollset: %p)", + (void *)pi_new, fd->fd, (void *)pollset); + } } } else if (fd->polling_island == NULL) { pi_new = polling_island_lock(pollset->polling_island); @@ -1579,7 +1621,7 @@ static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset, if (fd->polling_island != pi_new) { PI_ADD_REF(pi_new, "fd"); if (fd->polling_island != NULL) { - PI_UNREF(fd->polling_island, "fd"); + PI_UNREF(exec_ctx, fd->polling_island, "fd"); } fd->polling_island = pi_new; } @@ -1587,13 +1629,15 @@ static void pollset_add_fd(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset, if (pollset->polling_island != pi_new) { PI_ADD_REF(pi_new, "ps"); if (pollset->polling_island != NULL) { - PI_UNREF(pollset->polling_island, "ps"); + PI_UNREF(exec_ctx, pollset->polling_island, "ps"); } pollset->polling_island = pi_new; } - gpr_mu_unlock(&fd->pi_mu); + gpr_mu_unlock(&fd->mu); gpr_mu_unlock(&pollset->mu); + + GRPC_LOG_IF_ERROR("pollset_add_fd", error); } /******************************************************************************* @@ -1744,9 +1788,9 @@ static void pollset_set_del_pollset_set(grpc_exec_ctx *exec_ctx, void *grpc_fd_get_polling_island(grpc_fd *fd) { polling_island *pi; - gpr_mu_lock(&fd->pi_mu); + gpr_mu_lock(&fd->mu); pi = fd->polling_island; - gpr_mu_unlock(&fd->pi_mu); + gpr_mu_unlock(&fd->mu); return pi; } @@ -1794,6 +1838,7 @@ static const grpc_event_engine_vtable vtable = { .fd_notify_on_read = fd_notify_on_read, .fd_notify_on_write = fd_notify_on_write, .fd_get_read_notifier_pollset = fd_get_read_notifier_pollset, + .fd_get_workqueue = fd_get_workqueue, .pollset_init = pollset_init, .pollset_shutdown = pollset_shutdown, |