27 files changed, 500 insertions, 108 deletions
diff --git a/src/core/lib/debug/stats.c b/src/core/lib/debug/stats.c
new file mode 100644
index 0000000000..4dbd94c724
--- /dev/null
+++ b/src/core/lib/debug/stats.c
@@ -0,0 +1,67 @@
+/*
+ *
+ * Copyright 2017 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "src/core/lib/debug/stats.h"
+
+#include <inttypes.h>
+#include <string.h>
+
+#include <grpc/support/alloc.h>
+#include <grpc/support/string_util.h>
+#include <grpc/support/useful.h>
+
+#include "src/core/lib/support/string.h"
+
+grpc_stats_data *grpc_stats_per_cpu_storage = NULL;
+static size_t g_num_cores;
+
+void grpc_stats_init(void) {
+  g_num_cores = GPR_MAX(1, gpr_cpu_num_cores());
+  grpc_stats_per_cpu_storage =
+      gpr_zalloc(sizeof(grpc_stats_data) * g_num_cores);
+}
+
+void grpc_stats_shutdown(void) { gpr_free(grpc_stats_per_cpu_storage); }
+
+void grpc_stats_collect(grpc_stats_data *output) {
+  memset(output, 0, sizeof(*output));
+  for (size_t core = 0; core < g_num_cores; core++) {
+    for (size_t i = 0; i < GRPC_STATS_COUNTER_COUNT; i++) {
+      output->counters[i] += gpr_atm_no_barrier_load(
+          &grpc_stats_per_cpu_storage[core].counters[i]);
+    }
+  }
+}
+
+char *grpc_stats_data_as_json(const grpc_stats_data *data) {
+  gpr_strvec v;
+  char *tmp;
+  bool is_first = true;
+  gpr_strvec_init(&v);
+  gpr_strvec_add(&v, gpr_strdup("{"));
+  for (size_t i = 0; i < GRPC_STATS_COUNTER_COUNT; i++) {
+    gpr_asprintf(&tmp, "%s\"%s\": %" PRIdPTR, is_first ? "" : ", ",
+                 grpc_stats_counter_name[i], data->counters[i]);
+    gpr_strvec_add(&v, tmp);
+    is_first = false;
+  }
+  gpr_strvec_add(&v, gpr_strdup("}"));
+  tmp = gpr_strvec_flatten(&v, NULL);
+  gpr_strvec_destroy(&v);
+  return tmp;
+}
diff --git a/src/core/lib/debug/stats.h b/src/core/lib/debug/stats.h
new file mode 100644
index 0000000000..563b108dff
--- /dev/null
+++ b/src/core/lib/debug/stats.h
@@ -0,0 +1,44 @@
+/*
+ *
+ * Copyright 2017 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#ifndef GRPC_CORE_LIB_DEBUG_STATS_H
+#define GRPC_CORE_LIB_DEBUG_STATS_H
+
+#include <grpc/support/atm.h>
+#include "src/core/lib/debug/stats_data.h"
+#include "src/core/lib/iomgr/exec_ctx.h"
+
+typedef struct grpc_stats_data {
+  gpr_atm counters[GRPC_STATS_COUNTER_COUNT];
+} grpc_stats_data;
+
+extern grpc_stats_data *grpc_stats_per_cpu_storage;
+
+#define GRPC_THREAD_STATS_DATA(exec_ctx) \
+  (&grpc_stats_per_cpu_storage[(exec_ctx)->starting_cpu])
+
+#define GRPC_STATS_INC_COUNTER(exec_ctx, ctr) \
+  (gpr_atm_no_barrier_fetch_add(              \
+      &GRPC_THREAD_STATS_DATA((exec_ctx))->counters[(ctr)], 1))
+
+void grpc_stats_init(void);
+void grpc_stats_shutdown(void);
+void grpc_stats_collect(grpc_stats_data *output);
+char *grpc_stats_data_as_json(const grpc_stats_data *data);
+
+#endif
diff --git a/src/core/lib/support/thd_internal.h b/src/core/lib/debug/stats_data.c
index cc468c7846..2203358a7e 100644
--- a/src/core/lib/support/thd_internal.h
+++ b/src/core/lib/debug/stats_data.c
@@ -1,6 +1,5 @@
 /*
- *
- * Copyright 2015 gRPC authors.
+ * Copyright 2017 gRPC authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,12 +12,14 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- *
  */
 
-#ifndef GRPC_CORE_LIB_SUPPORT_THD_INTERNAL_H
-#define GRPC_CORE_LIB_SUPPORT_THD_INTERNAL_H
-
-/* Internal interfaces between modules within the gpr support library.  */
+/*
+ * Automatically generated by tools/codegen/core/gen_stats_data.py
+ */
 
-#endif /* GRPC_CORE_LIB_SUPPORT_THD_INTERNAL_H */
+#include "src/core/lib/debug/stats_data.h"
+const char *grpc_stats_counter_name[GRPC_STATS_COUNTER_COUNT] = {
+    "client_calls_created", "server_calls_created", "syscall_write",
+    "syscall_read",         "syscall_poll",         "syscall_wait",
+};
diff --git a/src/core/lib/debug/stats_data.h b/src/core/lib/debug/stats_data.h
new file mode 100644
index 0000000000..c9c2f65c30
--- /dev/null
+++ b/src/core/lib/debug/stats_data.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2017 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Automatically generated by tools/codegen/core/gen_stats_data.py
+ */
+
+#ifndef GRPC_CORE_LIB_DEBUG_STATS_DATA_H
+#define GRPC_CORE_LIB_DEBUG_STATS_DATA_H
+
+typedef enum {
+  GRPC_STATS_COUNTER_CLIENT_CALLS_CREATED,
+  GRPC_STATS_COUNTER_SERVER_CALLS_CREATED,
+  GRPC_STATS_COUNTER_SYSCALL_WRITE,
+  GRPC_STATS_COUNTER_SYSCALL_READ,
+  GRPC_STATS_COUNTER_SYSCALL_POLL,
+  GRPC_STATS_COUNTER_SYSCALL_WAIT,
+  GRPC_STATS_COUNTER_COUNT
+} grpc_stats_counters;
+#define GRPC_STATS_INC_CLIENT_CALLS_CREATED(exec_ctx) \
+  GRPC_STATS_INC_COUNTER((exec_ctx), GRPC_STATS_COUNTER_CLIENT_CALLS_CREATED)
+#define GRPC_STATS_INC_SERVER_CALLS_CREATED(exec_ctx) \
+  GRPC_STATS_INC_COUNTER((exec_ctx), GRPC_STATS_COUNTER_SERVER_CALLS_CREATED)
+#define GRPC_STATS_INC_SYSCALL_WRITE(exec_ctx) \
+  GRPC_STATS_INC_COUNTER((exec_ctx), GRPC_STATS_COUNTER_SYSCALL_WRITE)
+#define GRPC_STATS_INC_SYSCALL_READ(exec_ctx) \
+  GRPC_STATS_INC_COUNTER((exec_ctx), GRPC_STATS_COUNTER_SYSCALL_READ)
+#define GRPC_STATS_INC_SYSCALL_POLL(exec_ctx) \
+  GRPC_STATS_INC_COUNTER((exec_ctx), GRPC_STATS_COUNTER_SYSCALL_POLL)
+#define GRPC_STATS_INC_SYSCALL_WAIT(exec_ctx) \
+  GRPC_STATS_INC_COUNTER((exec_ctx), GRPC_STATS_COUNTER_SYSCALL_WAIT)
+extern const char *grpc_stats_counter_name[GRPC_STATS_COUNTER_COUNT];
+
+#endif /* GRPC_CORE_LIB_DEBUG_STATS_DATA_H */
diff --git a/src/core/lib/debug/stats_data.yaml b/src/core/lib/debug/stats_data.yaml
new file mode 100644
index 0000000000..8afe48f5cd
--- /dev/null
+++ b/src/core/lib/debug/stats_data.yaml
@@ -0,0 +1,9 @@
+# Stats data declaration
+# use tools/codegen/core/gen_stats_data.py to turn this into stats_data.h
+
+- counter: client_calls_created
+- counter: server_calls_created
+- counter: syscall_write
+- counter: syscall_read
+- counter: syscall_poll
+- counter: syscall_wait
diff --git a/src/core/lib/iomgr/ev_epoll1_linux.c b/src/core/lib/iomgr/ev_epoll1_linux.c
index 3ee646fa36..7f053fa728 100644
--- a/src/core/lib/iomgr/ev_epoll1_linux.c
+++ b/src/core/lib/iomgr/ev_epoll1_linux.c
@@ -39,6 +39,7 @@
 #include <grpc/support/tls.h>
 #include <grpc/support/useful.h>
 
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/iomgr/ev_posix.h"
 #include "src/core/lib/iomgr/iomgr_internal.h"
 #include "src/core/lib/iomgr/lockfree_event.h"
@@ -48,7 +49,60 @@
 #include "src/core/lib/support/string.h"
 
 static grpc_wakeup_fd global_wakeup_fd;
-static int g_epfd;
+
+/*******************************************************************************
+ * Singleton epoll set related fields
+ */
+
+#define MAX_EPOLL_EVENTS 100
+#define MAX_EPOLL_EVENTS_HANDLED_PER_ITERATION 1
+
+/* NOTE ON SYNCHRONIZATION:
+ * - Fields in this struct are only modified by the designated poller. Hence
+ *   there is no need for any locks to protect the struct.
+ * - num_events and cursor fields have to be of atomic type to provide memory
+ *   visibility guarantees only. i.e In case of multiple pollers, the designated
+ *   polling thread keeps changing; the thread that wrote these values may be
+ *   different from the thread reading the values
+ */
+typedef struct epoll_set {
+  int epfd;
+
+  /* The epoll_events after the last call to epoll_wait() */
+  struct epoll_event events[MAX_EPOLL_EVENTS];
+
+  /* The number of epoll_events after the last call to epoll_wait() */
+  gpr_atm num_events;
+
+  /* Index of the first event in epoll_events that has to be processed. This
+   * field is only valid if num_events > 0 */
+  gpr_atm cursor;
+} epoll_set;
+
+/* The global singleton epoll set */
+static epoll_set g_epoll_set;
+
+/* Must be called *only* once */
+static bool epoll_set_init() {
+  g_epoll_set.epfd = epoll_create1(EPOLL_CLOEXEC);
+  if (g_epoll_set.epfd < 0) {
+    gpr_log(GPR_ERROR, "epoll unavailable");
+    return false;
+  }
+
+  gpr_log(GPR_INFO, "grpc epoll fd: %d", g_epoll_set.epfd);
+  gpr_atm_no_barrier_store(&g_epoll_set.num_events, 0);
+  gpr_atm_no_barrier_store(&g_epoll_set.cursor, 0);
+  return true;
+}
+
+/* epoll_set_init() MUST be called before calling this. */
+static void epoll_set_shutdown() {
+  if (g_epoll_set.epfd >= 0) {
+    close(g_epoll_set.epfd);
+    g_epoll_set.epfd = -1;
+  }
+}
 
 /*******************************************************************************
  * Fd Declarations
@@ -122,7 +176,7 @@ struct grpc_pollset {
   bool kicked_without_poller;
 
   /* Set to true if the pollset is observed to have no workers available to
-   * poll */
+     poll */
   bool seen_inactive;
   bool shutting_down;             /* Is the pollset shutting down ? */
   grpc_closure *shutdown_closure; /* Called after after shutdown is complete */
@@ -228,7 +282,7 @@ static grpc_fd *fd_create(int fd, const char *name) {
 
   struct epoll_event ev = {.events = (uint32_t)(EPOLLIN | EPOLLOUT | EPOLLET),
                            .data.ptr = new_fd};
-  if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, fd, &ev) != 0) {
+  if (epoll_ctl(g_epoll_set.epfd, EPOLL_CTL_ADD, fd, &ev) != 0) {
     gpr_log(GPR_ERROR, "epoll_ctl failed: %s", strerror(errno));
   }
 
@@ -326,7 +380,10 @@ static void fd_become_writable(grpc_exec_ctx *exec_ctx, grpc_fd *fd) {
 
 GPR_TLS_DECL(g_current_thread_pollset);
 GPR_TLS_DECL(g_current_thread_worker);
+
+/* The designated poller */
 static gpr_atm g_active_poller;
+
 static pollset_neighbourhood *g_neighbourhoods;
 static size_t g_num_neighbourhoods;
 
@@ -380,7 +437,8 @@ static grpc_error *pollset_global_init(void) {
   if (err != GRPC_ERROR_NONE) return err;
   struct epoll_event ev = {.events = (uint32_t)(EPOLLIN | EPOLLET),
                            .data.ptr = &global_wakeup_fd};
-  if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, global_wakeup_fd.read_fd, &ev) != 0) {
+  if (epoll_ctl(g_epoll_set.epfd, EPOLL_CTL_ADD, global_wakeup_fd.read_fd,
+                &ev) != 0) {
     return GRPC_OS_ERROR(errno, "epoll_ctl");
   }
   g_num_neighbourhoods = GPR_CLAMP(gpr_cpu_num_cores(), 1, MAX_NEIGHBOURHOODS);
@@ -497,8 +555,6 @@ static void pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
   GPR_TIMER_END("pollset_shutdown", 0);
 }
 
-#define MAX_EPOLL_EVENTS 100
-
 static int poll_deadline_to_millis_timeout(gpr_timespec deadline,
                                            gpr_timespec now) {
   gpr_timespec timeout;
@@ -517,56 +573,90 @@ static int poll_deadline_to_millis_timeout(gpr_timespec deadline,
   return millis >= 1 ? millis : 1;
 }
 
-static grpc_error *pollset_epoll(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
-                                 gpr_timespec now, gpr_timespec deadline) {
-  struct epoll_event events[MAX_EPOLL_EVENTS];
-  static const char *err_desc = "pollset_poll";
-
-  GPR_TIMER_BEGIN("pollset_epoll", 0);
-
-  int timeout = poll_deadline_to_millis_timeout(deadline, now);
-
-  if (timeout != 0) {
-    GRPC_SCHEDULING_START_BLOCKING_REGION;
-  }
-  int r;
-  do {
-    GPR_TIMER_BEGIN("epoll_wait", 0);
-    r = epoll_wait(g_epfd, events, MAX_EPOLL_EVENTS, timeout);
-    GPR_TIMER_END("epoll_wait", 0);
-  } while (r < 0 && errno == EINTR);
-  if (timeout != 0) {
-    GRPC_SCHEDULING_END_BLOCKING_REGION;
-  }
+/* Process the epoll events found by do_epoll_wait() function.
+   - g_epoll_set.cursor points to the index of the first event to be processed
+   - This function then processes up-to MAX_EPOLL_EVENTS_PER_ITERATION and
+     updates the g_epoll_set.cursor
+
+   NOTE ON SYNCRHONIZATION: Similar to do_epoll_wait(), this function is only
+   called by g_active_poller thread. So there is no need for synchronization
+   when accessing fields in g_epoll_set */
+static grpc_error *process_epoll_events(grpc_exec_ctx *exec_ctx,
+                                        grpc_pollset *pollset) {
+  static const char *err_desc = "process_events";
+  grpc_error *error = GRPC_ERROR_NONE;
 
-  if (r < 0) {
-    GPR_TIMER_END("pollset_epoll", 0);
-    return GRPC_OS_ERROR(errno, "epoll_wait");
-  }
+  GPR_TIMER_BEGIN("process_epoll_events", 0);
+  long num_events = gpr_atm_acq_load(&g_epoll_set.num_events);
+  long cursor = gpr_atm_acq_load(&g_epoll_set.cursor);
+  for (int idx = 0;
+       (idx < MAX_EPOLL_EVENTS_HANDLED_PER_ITERATION) && cursor != num_events;
+       idx++) {
+    long c = cursor++;
+    struct epoll_event *ev = &g_epoll_set.events[c];
+    void *data_ptr = ev->data.ptr;
 
-  grpc_error *error = GRPC_ERROR_NONE;
-  for (int i = 0; i < r; i++) {
-    void *data_ptr = events[i].data.ptr;
     if (data_ptr == &global_wakeup_fd) {
       append_error(&error, grpc_wakeup_fd_consume_wakeup(&global_wakeup_fd),
                    err_desc);
     } else {
       grpc_fd *fd = (grpc_fd *)(data_ptr);
-      bool cancel = (events[i].events & (EPOLLERR | EPOLLHUP)) != 0;
-      bool read_ev = (events[i].events & (EPOLLIN | EPOLLPRI)) != 0;
-      bool write_ev = (events[i].events & EPOLLOUT) != 0;
+      bool cancel = (ev->events & (EPOLLERR | EPOLLHUP)) != 0;
+      bool read_ev = (ev->events & (EPOLLIN | EPOLLPRI)) != 0;
+      bool write_ev = (ev->events & EPOLLOUT) != 0;
+
       if (read_ev || cancel) {
         fd_become_readable(exec_ctx, fd, pollset);
       }
+
       if (write_ev || cancel) {
         fd_become_writable(exec_ctx, fd);
       }
     }
   }
-  GPR_TIMER_END("pollset_epoll", 0);
+  gpr_atm_rel_store(&g_epoll_set.cursor, cursor);
+  GPR_TIMER_END("process_epoll_events", 0);
   return error;
 }
 
+/* Do epoll_wait and store the events in g_epoll_set.events field. This does not
+   "process" any of the events yet; that is done in process_epoll_events().
+   *See process_epoll_events() function for more details.
+
+   NOTE ON SYNCHRONIZATION: At any point of time, only the g_active_poller
+   (i.e the designated poller thread) will be calling this function. So there is
+   no need for any synchronization when accesing fields in g_epoll_set */
+static grpc_error *do_epoll_wait(grpc_exec_ctx *exec_ctx, grpc_pollset *ps,
+                                 gpr_timespec now, gpr_timespec deadline) {
+  GPR_TIMER_BEGIN("do_epoll_wait", 0);
+
+  int r;
+  int timeout = poll_deadline_to_millis_timeout(deadline, now);
+  if (timeout != 0) {
+    GRPC_SCHEDULING_START_BLOCKING_REGION;
+  }
+  do {
+    GRPC_STATS_INC_SYSCALL_POLL(exec_ctx);
+    r = epoll_wait(g_epoll_set.epfd, g_epoll_set.events, MAX_EPOLL_EVENTS,
+                   timeout);
+  } while (r < 0 && errno == EINTR);
+  if (timeout != 0) {
+    GRPC_SCHEDULING_END_BLOCKING_REGION;
+  }
+
+  if (r < 0) return GRPC_OS_ERROR(errno, "epoll_wait");
+
+  if (GRPC_TRACER_ON(grpc_polling_trace)) {
+    gpr_log(GPR_DEBUG, "ps: %p poll got %d events", ps, r);
+  }
+
+  gpr_atm_rel_store(&g_epoll_set.num_events, r);
+  gpr_atm_rel_store(&g_epoll_set.cursor, 0);
+
+  GPR_TIMER_END("do_epoll_wait", 0);
+  return GRPC_ERROR_NONE;
+}
+
 static bool begin_worker(grpc_pollset *pollset, grpc_pollset_worker *worker,
                          grpc_pollset_worker **worker_hdl, gpr_timespec *now,
                          gpr_timespec deadline) {
@@ -827,32 +917,55 @@ static void end_worker(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
    The function pollset_work() may temporarily release the lock (pollset->po.mu)
    during the course of its execution but it will always re-acquire the lock and
    ensure that it is held by the time the function returns */
-static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
+static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *ps,
                                 grpc_pollset_worker **worker_hdl,
                                 gpr_timespec now, gpr_timespec deadline) {
   grpc_pollset_worker worker;
   grpc_error *error = GRPC_ERROR_NONE;
   static const char *err_desc = "pollset_work";
   GPR_TIMER_BEGIN("pollset_work", 0);
-  if (pollset->kicked_without_poller) {
-    pollset->kicked_without_poller = false;
+  if (ps->kicked_without_poller) {
+    ps->kicked_without_poller = false;
     GPR_TIMER_END("pollset_work", 0);
     return GRPC_ERROR_NONE;
   }
-  if (begin_worker(pollset, &worker, worker_hdl, &now, deadline)) {
-    gpr_tls_set(&g_current_thread_pollset, (intptr_t)pollset);
+
+  if (begin_worker(ps, &worker, worker_hdl, &now, deadline)) {
+    gpr_tls_set(&g_current_thread_pollset, (intptr_t)ps);
     gpr_tls_set(&g_current_thread_worker, (intptr_t)&worker);
-    GPR_ASSERT(!pollset->shutting_down);
-    GPR_ASSERT(!pollset->seen_inactive);
-    gpr_mu_unlock(&pollset->mu);
-    append_error(&error, pollset_epoll(exec_ctx, pollset, now, deadline),
-                 err_desc);
-    gpr_mu_lock(&pollset->mu);
+    GPR_ASSERT(!ps->shutting_down);
+    GPR_ASSERT(!ps->seen_inactive);
+
+    gpr_mu_unlock(&ps->mu); /* unlock */
+    /* This is the designated polling thread at this point and should ideally do
+       polling. However, if there are unprocessed events left from a previous
+       call to do_epoll_wait(), skip calling epoll_wait() in this iteration and
+       process the pending epoll events.
+
+       The reason for decoupling do_epoll_wait and process_epoll_events is to
+       better distrubute the work (i.e handling epoll events) across multiple
+       threads
+
+       process_epoll_events() returns very quickly: It just queues the work on
+       exec_ctx but does not execute it (the actual exectution or more
+       accurately grpc_exec_ctx_flush() happens in end_worker() AFTER selecting
+       a designated poller). So we are not waiting long periods without a
+       designated poller */
+    if (gpr_atm_acq_load(&g_epoll_set.cursor) ==
+        gpr_atm_acq_load(&g_epoll_set.num_events)) {
+      append_error(&error, do_epoll_wait(exec_ctx, ps, now, deadline),
+                   err_desc);
+    }
+    append_error(&error, process_epoll_events(exec_ctx, ps), err_desc);
+
+    gpr_mu_lock(&ps->mu); /* lock */
+
     gpr_tls_set(&g_current_thread_worker, 0);
   } else {
-    gpr_tls_set(&g_current_thread_pollset, (intptr_t)pollset);
+    gpr_tls_set(&g_current_thread_pollset, (intptr_t)ps);
   }
-  end_worker(exec_ctx, pollset, &worker, worker_hdl);
+  end_worker(exec_ctx, ps, &worker, worker_hdl);
+
   gpr_tls_set(&g_current_thread_pollset, 0);
   GPR_TIMER_END("pollset_work", 0);
   return error;
@@ -1043,7 +1156,7 @@ static void pollset_set_del_pollset_set(grpc_exec_ctx *exec_ctx,
 static void shutdown_engine(void) {
   fd_global_shutdown();
   pollset_global_shutdown();
-  close(g_epfd);
+  epoll_set_shutdown();
 }
 
 static const grpc_event_engine_vtable vtable = {
@@ -1078,32 +1191,25 @@ static const grpc_event_engine_vtable vtable = {
 };
 
 /* It is possible that GLIBC has epoll but the underlying kernel doesn't.
- * Create a dummy epoll_fd to make sure epoll support is available */
+ * Create epoll_fd (epoll_set_init() takes care of that) to make sure epoll
+ * support is available */
 const grpc_event_engine_vtable *grpc_init_epoll1_linux(bool explicit_request) {
-  if (!explicit_request) {
-    return NULL;
-  }
-
   if (!grpc_has_wakeup_fd()) {
     return NULL;
   }
 
-  g_epfd = epoll_create1(EPOLL_CLOEXEC);
-  if (g_epfd < 0) {
-    gpr_log(GPR_ERROR, "epoll unavailable");
+  if (!epoll_set_init()) {
     return NULL;
   }
 
   fd_global_init();
 
   if (!GRPC_LOG_IF_ERROR("pollset_global_init", pollset_global_init())) {
-    close(g_epfd);
     fd_global_shutdown();
+    epoll_set_shutdown();
     return NULL;
   }
 
-  gpr_log(GPR_ERROR, "grpc epoll fd: %d", g_epfd);
-
   return &vtable;
 }
 
diff --git a/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c b/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
index f2f3e15704..e2e3cd9003 100644
--- a/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
+++ b/src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
@@ -40,6 +40,7 @@
 #include <grpc/support/tls.h>
 #include <grpc/support/useful.h>
 
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/debug/trace.h"
 #include "src/core/lib/iomgr/ev_posix.h"
 #include "src/core/lib/iomgr/iomgr_internal.h"
@@ -1286,7 +1287,8 @@ static void pollset_destroy(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset) {
 }
 
 /* NOTE: This function may modify 'now' */
-static bool acquire_polling_lease(grpc_pollset_worker *worker,
+static bool acquire_polling_lease(grpc_exec_ctx *exec_ctx,
+                                  grpc_pollset_worker *worker,
                                   polling_island *pi, gpr_timespec deadline,
                                   gpr_timespec *now) {
   bool is_lease_acquired = false;
@@ -1306,6 +1308,7 @@ static bool acquire_polling_lease(grpc_pollset_worker *worker,
     } else {
       struct timespec sigwait_timeout = millis_to_timespec(timeout_ms);
       GRPC_SCHEDULING_START_BLOCKING_REGION;
+      GRPC_STATS_INC_SYSCALL_WAIT(exec_ctx);
       ret = sigtimedwait(&g_wakeup_sig_set, NULL, &sigwait_timeout);
       GRPC_SCHEDULING_END_BLOCKING_REGION;
     }
@@ -1378,7 +1381,7 @@ static void pollset_do_epoll_pwait(grpc_exec_ctx *exec_ctx, int epoll_fd,
                                    sigset_t *sig_mask, grpc_error **error) {
   /* Only g_max_pollers_per_pi threads can be doing polling in parallel.
      If we cannot get a lease, we cannot continue to do epoll_pwait() */
-  if (!acquire_polling_lease(worker, pi, deadline, &now)) {
+  if (!acquire_polling_lease(exec_ctx, worker, pi, deadline, &now)) {
     return;
   }
 
@@ -1391,6 +1394,7 @@ static void pollset_do_epoll_pwait(grpc_exec_ctx *exec_ctx, int epoll_fd,
   int timeout_ms = poll_deadline_to_millis_timeout(deadline, now);
 
   GRPC_SCHEDULING_START_BLOCKING_REGION;
+  GRPC_STATS_INC_SYSCALL_POLL(exec_ctx);
   ep_rv =
       epoll_pwait(epoll_fd, ep_ev, GRPC_EPOLL_MAX_EVENTS, timeout_ms, sig_mask);
   GRPC_SCHEDULING_END_BLOCKING_REGION;
diff --git a/src/core/lib/iomgr/ev_epoll_thread_pool_linux.c b/src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
index 07c8eadf4f..ddb8c74d2d 100644
--- a/src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
+++ b/src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
@@ -41,6 +41,7 @@
 #include <grpc/support/tls.h>
 #include <grpc/support/useful.h>
 
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/iomgr/ev_posix.h"
 #include "src/core/lib/iomgr/iomgr_internal.h"
 #include "src/core/lib/iomgr/lockfree_event.h"
@@ -776,6 +777,7 @@ static void do_epoll_wait(grpc_exec_ctx *exec_ctx, int epoll_fd, epoll_set *eps,
 
   GRPC_SCHEDULING_START_BLOCKING_REGION;
   acquire_epoll_lease(eps);
+  GRPC_STATS_INC_SYSCALL_POLL(exec_ctx);
   ep_rv = epoll_wait(epoll_fd, ep_ev, GRPC_EPOLL_MAX_EVENTS, timeout_ms);
   release_epoll_lease(eps);
   GRPC_SCHEDULING_END_BLOCKING_REGION;
diff --git a/src/core/lib/iomgr/ev_epollex_linux.c b/src/core/lib/iomgr/ev_epollex_linux.c
index 770d1fd0a9..0e42f76af3 100644
--- a/src/core/lib/iomgr/ev_epollex_linux.c
+++ b/src/core/lib/iomgr/ev_epollex_linux.c
@@ -37,6 +37,7 @@
 #include <grpc/support/tls.h>
 #include <grpc/support/useful.h>
 
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/iomgr/ev_posix.h"
 #include "src/core/lib/iomgr/iomgr_internal.h"
 #include "src/core/lib/iomgr/is_epollexclusive_available.h"
@@ -49,7 +50,7 @@
 #include "src/core/lib/support/spinlock.h"
 
 /*******************************************************************************
- * Pollset-set sibling link
+ * Polling object
  */
 
 typedef enum {
@@ -814,6 +815,7 @@ static grpc_error *pollset_epoll(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
   }
   int r;
   do {
+    GRPC_STATS_INC_SYSCALL_POLL(exec_ctx);
     r = epoll_wait(p->epfd, pollset->events, MAX_EPOLL_EVENTS, timeout);
   } while (r < 0 && errno == EINTR);
   if (timeout != 0) {
diff --git a/src/core/lib/iomgr/ev_epollsig_linux.c b/src/core/lib/iomgr/ev_epollsig_linux.c
index 070d75e42a..59c7cdc285 100644
--- a/src/core/lib/iomgr/ev_epollsig_linux.c
+++ b/src/core/lib/iomgr/ev_epollsig_linux.c
@@ -39,6 +39,7 @@
 #include <grpc/support/tls.h>
 #include <grpc/support/useful.h>
 
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/iomgr/ev_posix.h"
 #include "src/core/lib/iomgr/iomgr_internal.h"
 #include "src/core/lib/iomgr/lockfree_event.h"
@@ -1236,6 +1237,7 @@ static void pollset_work_and_unlock(grpc_exec_ctx *exec_ctx,
   g_current_thread_polling_island = pi;
 
   GRPC_SCHEDULING_START_BLOCKING_REGION;
+  GRPC_STATS_INC_SYSCALL_POLL(exec_ctx);
   ep_rv =
       epoll_pwait(epoll_fd, ep_ev, GRPC_EPOLL_MAX_EVENTS, timeout_ms, sig_mask);
   GRPC_SCHEDULING_END_BLOCKING_REGION;
@@ -1728,9 +1730,7 @@ const grpc_event_engine_vtable *grpc_init_epollsig_linux(
   }
 
   if (!is_grpc_wakeup_signal_initialized) {
-    /* TODO(ctiller): when other epoll engines are ready, remove the true || to
-     * force this to be explitly chosen if needed */
-    if (true || explicit_request) {
+    if (explicit_request) {
       grpc_use_signal(SIGRTMIN + 6);
     } else {
       return NULL;
diff --git a/src/core/lib/iomgr/ev_poll_posix.c b/src/core/lib/iomgr/ev_poll_posix.c
index 9472a8e520..fbd265f3ce 100644
--- a/src/core/lib/iomgr/ev_poll_posix.c
+++ b/src/core/lib/iomgr/ev_poll_posix.c
@@ -36,6 +36,7 @@
 #include <grpc/support/tls.h>
 #include <grpc/support/useful.h>
 
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/iomgr/iomgr_internal.h"
 #include "src/core/lib/iomgr/timer.h"
 #include "src/core/lib/iomgr/wakeup_fd_cv.h"
@@ -983,6 +984,7 @@ static grpc_error *pollset_work(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
       /* TODO(vpai): Consider first doing a 0 timeout poll here to avoid
          even going into the blocking annotation if possible */
       GRPC_SCHEDULING_START_BLOCKING_REGION;
+      GRPC_STATS_INC_SYSCALL_POLL(exec_ctx);
       r = grpc_poll_function(pfds, pfd_count, timeout);
       GRPC_SCHEDULING_END_BLOCKING_REGION;
 
diff --git a/src/core/lib/iomgr/ev_posix.c b/src/core/lib/iomgr/ev_posix.c
index 91f8cd5482..424b40e425 100644
--- a/src/core/lib/iomgr/ev_posix.c
+++ b/src/core/lib/iomgr/ev_posix.c
@@ -64,8 +64,8 @@ typedef struct {
 } event_engine_factory;
 
 static const event_engine_factory g_factories[] = {
-    {"epollsig", grpc_init_epollsig_linux},
     {"epoll1", grpc_init_epoll1_linux},
+    {"epollsig", grpc_init_epollsig_linux},
     {"epoll-threadpool", grpc_init_epoll_thread_pool_linux},
     {"epoll-limited", grpc_init_epoll_limited_pollers_linux},
     {"poll", grpc_init_poll_posix},
diff --git a/src/core/lib/iomgr/exec_ctx.h b/src/core/lib/iomgr/exec_ctx.h
index a0d2a965d5..c89792c8c4 100644
--- a/src/core/lib/iomgr/exec_ctx.h
+++ b/src/core/lib/iomgr/exec_ctx.h
@@ -19,6 +19,7 @@
 #ifndef GRPC_CORE_LIB_IOMGR_EXEC_CTX_H
 #define GRPC_CORE_LIB_IOMGR_EXEC_CTX_H
 
+#include <grpc/support/cpu.h>
 #include "src/core/lib/iomgr/closure.h"
 
 /* #define GRPC_EXECUTION_CONTEXT_SANITIZER 1 */
@@ -62,6 +63,7 @@ struct grpc_exec_ctx {
   /** last active combiner in the active combiner list */
   grpc_combiner *last_combiner;
   uintptr_t flags;
+  unsigned starting_cpu;
   void *check_ready_to_finish_arg;
   bool (*check_ready_to_finish)(grpc_exec_ctx *exec_ctx, void *arg);
 };
@@ -69,7 +71,10 @@ struct grpc_exec_ctx {
 /* initializer for grpc_exec_ctx:
    prefer to use GRPC_EXEC_CTX_INIT whenever possible */
 #define GRPC_EXEC_CTX_INITIALIZER(flags, finish_check, finish_check_arg) \
-  { GRPC_CLOSURE_LIST_INIT, NULL, NULL, flags, finish_check_arg, finish_check }
+  {                                                                      \
+    GRPC_CLOSURE_LIST_INIT, NULL, NULL, flags, gpr_cpu_current_cpu(),    \
+        finish_check_arg, finish_check                                   \
+  }
 
 /* initialize an execution context at the top level of an API call into grpc
    (this is safe to use elsewhere, though possibly not as efficient) */
diff --git a/src/core/lib/iomgr/iocp_windows.c b/src/core/lib/iomgr/iocp_windows.c
index e343f8a794..c082179c0b 100644
--- a/src/core/lib/iomgr/iocp_windows.c
+++ b/src/core/lib/iomgr/iocp_windows.c
@@ -27,6 +27,7 @@
 #include <grpc/support/log_windows.h>
 #include <grpc/support/thd.h>
 
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/iomgr/iocp_windows.h"
 #include "src/core/lib/iomgr/iomgr_internal.h"
 #include "src/core/lib/iomgr/socket_windows.h"
@@ -65,6 +66,7 @@ grpc_iocp_work_status grpc_iocp_work(grpc_exec_ctx *exec_ctx,
   LPOVERLAPPED overlapped;
   grpc_winsocket *socket;
   grpc_winsocket_callback_info *info;
+  GRPC_STATS_INC_SYSCALL_POLL(exec_ctx);
   success = GetQueuedCompletionStatus(
       g_iocp, &bytes, &completion_key, &overlapped,
       deadline_to_millis_timeout(deadline, gpr_now(deadline.clock_type)));
diff --git a/src/core/lib/iomgr/tcp_posix.c b/src/core/lib/iomgr/tcp_posix.c
index 2f543fd8a9..6c620ca245 100644
--- a/src/core/lib/iomgr/tcp_posix.c
+++ b/src/core/lib/iomgr/tcp_posix.c
@@ -40,6 +40,7 @@
 #include <grpc/support/useful.h>
 
 #include "src/core/lib/channel/channel_args.h"
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/debug/trace.h"
 #include "src/core/lib/iomgr/ev_posix.h"
 #include "src/core/lib/profiling/timers.h"
@@ -66,7 +67,6 @@ typedef struct {
   grpc_fd *em_fd;
   int fd;
   bool finished_edge;
-  msg_iovlen_type iov_size; /* Number of slices to allocate per read attempt */
   double target_length;
   double bytes_read_this_round;
   gpr_refcount refcount;
@@ -239,7 +239,6 @@ static void tcp_do_read(grpc_exec_ctx *exec_ctx, grpc_tcp *tcp) {
   size_t i;
 
   GPR_ASSERT(!tcp->finished_edge);
-  GPR_ASSERT(tcp->iov_size <= MAX_READ_IOVEC);
   GPR_ASSERT(tcp->incoming_buffer->count <= MAX_READ_IOVEC);
   GPR_TIMER_BEGIN("tcp_continue_read", 0);
 
@@ -251,13 +250,14 @@ static void tcp_do_read(grpc_exec_ctx *exec_ctx, grpc_tcp *tcp) {
   msg.msg_name = NULL;
   msg.msg_namelen = 0;
   msg.msg_iov = iov;
-  msg.msg_iovlen = tcp->iov_size;
+  msg.msg_iovlen = (msg_iovlen_type)tcp->incoming_buffer->count;
   msg.msg_control = NULL;
   msg.msg_controllen = 0;
   msg.msg_flags = 0;
 
   GPR_TIMER_BEGIN("recvmsg", 0);
   do {
+    GRPC_STATS_INC_SYSCALL_READ(exec_ctx);
     read_bytes = recvmsg(tcp->fd, &msg, 0);
   } while (read_bytes < 0 && errno == EINTR);
   GPR_TIMER_END("recvmsg", read_bytes >= 0);
@@ -361,7 +361,8 @@ static void tcp_read(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
 
 /* returns true if done, false if pending; if returning true, *error is set */
 #define MAX_WRITE_IOVEC 1000
-static bool tcp_flush(grpc_tcp *tcp, grpc_error **error) {
+static bool tcp_flush(grpc_exec_ctx *exec_ctx, grpc_tcp *tcp,
+                      grpc_error **error) {
   struct msghdr msg;
   struct iovec iov[MAX_WRITE_IOVEC];
   msg_iovlen_type iov_size;
@@ -403,6 +404,7 @@ static bool tcp_flush(grpc_tcp *tcp, grpc_error **error) {
     GPR_TIMER_BEGIN("sendmsg", 1);
     do {
       /* TODO(klempner): Cork if this is a partial write */
+      GRPC_STATS_INC_SYSCALL_WRITE(exec_ctx);
       sent_length = sendmsg(tcp->fd, &msg, SENDMSG_FLAGS);
     } while (sent_length < 0 && errno == EINTR);
     GPR_TIMER_END("sendmsg", 0);
@@ -459,7 +461,7 @@ static void tcp_handle_write(grpc_exec_ctx *exec_ctx, void *arg /* grpc_tcp */,
     return;
   }
 
-  if (!tcp_flush(tcp, &error)) {
+  if (!tcp_flush(exec_ctx, tcp, &error)) {
     if (GRPC_TRACER_ON(grpc_tcp_trace)) {
       gpr_log(GPR_DEBUG, "write: delayed");
     }
@@ -510,7 +512,7 @@ static void tcp_write(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
   tcp->outgoing_slice_idx = 0;
   tcp->outgoing_byte_idx = 0;
 
-  if (!tcp_flush(tcp, &error)) {
+  if (!tcp_flush(exec_ctx, tcp, &error)) {
     TCP_REF(tcp, "write");
     tcp->write_cb = cb;
     if (GRPC_TRACER_ON(grpc_tcp_trace)) {
@@ -617,7 +619,6 @@ grpc_endpoint *grpc_tcp_create(grpc_exec_ctx *exec_ctx, grpc_fd *em_fd,
   tcp->min_read_chunk_size = tcp_min_read_chunk_size;
   tcp->max_read_chunk_size = tcp_max_read_chunk_size;
   tcp->bytes_read_this_round = 0;
-  tcp->iov_size = 1;
   tcp->finished_edge = true;
   /* paired with unref in grpc_tcp_destroy */
   gpr_ref_init(&tcp->refcount, 1);
diff --git a/src/core/lib/profiling/timers.h b/src/core/lib/profiling/timers.h
index 4d1437f606..7f02b4bf84 100644
--- a/src/core/lib/profiling/timers.h
+++ b/src/core/lib/profiling/timers.h
@@ -94,7 +94,7 @@ class ProfileScope {
  public:
   ProfileScope(const char *desc, bool important, const char *file, int line)
       : desc_(desc) {
-    gpr_timer_begin((desc_, important ? 1 : 0, file, line);
+    gpr_timer_begin(desc_, important ? 1 : 0, file, line);
   }
   ~ProfileScope() { gpr_timer_end(desc_, 0, "n/a", 0); }
 
diff --git a/src/core/lib/support/block_annotate.h b/src/core/lib/support/block_annotate.h
index 0a2cb45018..8e3ef7df65 100644
--- a/src/core/lib/support/block_annotate.h
+++ b/src/core/lib/support/block_annotate.h
@@ -19,15 +19,37 @@
 #ifndef GRPC_CORE_LIB_SUPPORT_BLOCK_ANNOTATE_H
 #define GRPC_CORE_LIB_SUPPORT_BLOCK_ANNOTATE_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void gpr_thd_start_blocking_region();
+void gpr_thd_end_blocking_region();
+
+#ifdef __cplusplus
+}
+#endif
+
 /* These annotations identify the beginning and end of regions where
    the code may block for reasons other than synchronization functions.
    These include poll, epoll, and getaddrinfo. */
 
+#ifdef GRPC_SCHEDULING_MARK_BLOCKING_REGION
+#define GRPC_SCHEDULING_START_BLOCKING_REGION \
+  do {                                        \
+    gpr_thd_start_blocking_region();          \
+  } while (0)
+#define GRPC_SCHEDULING_END_BLOCKING_REGION \
+  do {                                      \
+    gpr_thd_end_blocking_region();          \
+  } while (0)
+#else
 #define GRPC_SCHEDULING_START_BLOCKING_REGION \
   do {                                        \
   } while (0)
 #define GRPC_SCHEDULING_END_BLOCKING_REGION \
   do {                                      \
   } while (0)
+#endif
 
 #endif /* GRPC_CORE_LIB_SUPPORT_BLOCK_ANNOTATE_H */
diff --git a/src/core/lib/surface/call.c b/src/core/lib/surface/call.c
index a0ac9ae7d5..e68c201134 100644
--- a/src/core/lib/surface/call.c
+++ b/src/core/lib/surface/call.c
@@ -32,6 +32,7 @@
 
 #include "src/core/lib/channel/channel_stack.h"
 #include "src/core/lib/compression/algorithm_metadata.h"
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/iomgr/timer.h"
 #include "src/core/lib/profiling/timers.h"
 #include "src/core/lib/slice/slice_internal.h"
@@ -144,6 +145,9 @@ typedef struct {
   grpc_call *sibling_prev;
 } child_call;
 
+#define RECV_NONE ((gpr_atm)0)
+#define RECV_INITIAL_METADATA_FIRST ((gpr_atm)1)
+
 struct grpc_call {
   gpr_refcount ext_ref;
   gpr_arena *arena;
@@ -170,9 +174,6 @@ struct grpc_call {
   gpr_atm any_ops_sent_atm;
   gpr_atm received_final_op_atm;
 
-  /* have we received initial metadata */
-  bool has_initial_md_been_received;
-
   batch_control *active_batches[MAX_CONCURRENT_BATCHES];
   grpc_transport_stream_op_batch_payload stream_op_payload;
 
@@ -230,7 +231,23 @@ struct grpc_call {
     } server;
   } final_op;
 
-  void *saved_receiving_stream_ready_bctlp;
+  /* recv_state can contain one of the following values:
+     RECV_NONE :                 :  no initial metadata and messages received
+     RECV_INITIAL_METADATA_FIRST :  received initial metadata first
+     a batch_control*            :  received messages first
+
+                 +------1------RECV_NONE------3-----+
+                 |                                  |
+                 |                                  |
+                 v                                  v
+     RECV_INITIAL_METADATA_FIRST        receiving_stream_ready_bctlp
+           |           ^                      |           ^
+           |           |                      |           |
+           +-----2-----+                      +-----4-----+
+
+    For 1, 4: See receiving_initial_metadata_ready() function
+    For 2, 3: See receiving_stream_ready() function */
+  gpr_atm recv_state;
 };
 
 grpc_tracer_flag grpc_call_error_trace =
@@ -318,6 +335,11 @@ grpc_error *grpc_call_create(grpc_exec_ctx *exec_ctx,
   /* Always support no compression */
   GPR_BITSET(&call->encodings_accepted_by_peer, GRPC_COMPRESS_NONE);
   call->is_client = args->server_transport_data == NULL;
+  if (call->is_client) {
+    GRPC_STATS_INC_CLIENT_CALLS_CREATED(exec_ctx);
+  } else {
+    GRPC_STATS_INC_SERVER_CALLS_CREATED(exec_ctx);
+  }
   call->stream_op_payload.context = call->context;
   grpc_slice path = grpc_empty_slice();
   if (call->is_client) {
@@ -1400,11 +1422,12 @@ static void receiving_stream_ready(grpc_exec_ctx *exec_ctx, void *bctlp,
     cancel_with_error(exec_ctx, call, STATUS_FROM_SURFACE,
                       GRPC_ERROR_REF(error));
   }
-  if (call->has_initial_md_been_received || error != GRPC_ERROR_NONE ||
-      call->receiving_stream == NULL) {
+  /* If recv_state is RECV_NONE, we will save the batch_control
+   * object with rel_cas, and will not use it after the cas. Its corresponding
+   * acq_load is in receiving_initial_metadata_ready() */
+  if (error != GRPC_ERROR_NONE || call->receiving_stream == NULL ||
+      !gpr_atm_rel_cas(&call->recv_state, RECV_NONE, (gpr_atm)bctlp)) {
     process_data_after_md(exec_ctx, bctlp);
-  } else {
-    call->saved_receiving_stream_ready_bctlp = bctlp;
   }
 }
 
@@ -1533,12 +1556,31 @@ static void receiving_initial_metadata_ready(grpc_exec_ctx *exec_ctx,
     }
   }
 
-  call->has_initial_md_been_received = true;
-  if (call->saved_receiving_stream_ready_bctlp != NULL) {
-    grpc_closure *saved_rsr_closure = GRPC_CLOSURE_CREATE(
-        receiving_stream_ready, call->saved_receiving_stream_ready_bctlp,
-        grpc_schedule_on_exec_ctx);
-    call->saved_receiving_stream_ready_bctlp = NULL;
+  grpc_closure *saved_rsr_closure = NULL;
+  while (true) {
+    gpr_atm rsr_bctlp = gpr_atm_acq_load(&call->recv_state);
+    /* Should only receive initial metadata once */
+    GPR_ASSERT(rsr_bctlp != 1);
+    if (rsr_bctlp == 0) {
+      /* We haven't seen initial metadata and messages before, thus initial
+       * metadata is received first.
+       * no_barrier_cas is used, as this function won't access the batch_control
+       * object saved by receiving_stream_ready() if the initial metadata is
+       * received first. */
+      if (gpr_atm_no_barrier_cas(&call->recv_state, RECV_NONE,
+                                 RECV_INITIAL_METADATA_FIRST)) {
+        break;
+      }
+    } else {
+      /* Already received messages */
+      saved_rsr_closure = GRPC_CLOSURE_CREATE(receiving_stream_ready,
+                                              (batch_control *)rsr_bctlp,
+                                              grpc_schedule_on_exec_ctx);
+      /* No need to modify recv_state */
+      break;
+    }
+  }
+  if (saved_rsr_closure != NULL) {
     GRPC_CLOSURE_RUN(exec_ctx, saved_rsr_closure, GRPC_ERROR_REF(error));
   }
 
diff --git a/src/core/lib/surface/call.h b/src/core/lib/surface/call.h
index 185bfccb77..d537637cbb 100644
--- a/src/core/lib/surface/call.h
+++ b/src/core/lib/surface/call.h
@@ -19,6 +19,10 @@
 #ifndef GRPC_CORE_LIB_SURFACE_CALL_H
 #define GRPC_CORE_LIB_SURFACE_CALL_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "src/core/lib/channel/channel_stack.h"
 #include "src/core/lib/channel/context.h"
 #include "src/core/lib/surface/api_trace.h"
@@ -26,10 +30,6 @@
 #include <grpc/grpc.h>
 #include <grpc/impl/codegen/compression_types.h>
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 typedef void (*grpc_ioreq_completion_func)(grpc_exec_ctx *exec_ctx,
                                            grpc_call *call, int success,
                                            void *user_data);
@@ -89,7 +89,7 @@ grpc_call_error grpc_call_start_batch_and_execute(grpc_exec_ctx *exec_ctx,
 /* Given the top call_element, get the call object. */
 grpc_call *grpc_call_from_top_element(grpc_call_element *surface_element);
 
-void grpc_call_log_batch(char *file, int line, gpr_log_severity severity,
+void grpc_call_log_batch(const char *file, int line, gpr_log_severity severity,
                          grpc_call *call, const grpc_op *ops, size_t nops,
                          void *tag);
 
diff --git a/src/core/lib/surface/call_log_batch.c b/src/core/lib/surface/call_log_batch.c
index 4443aba58a..4a1c265817 100644
--- a/src/core/lib/surface/call_log_batch.c
+++ b/src/core/lib/surface/call_log_batch.c
@@ -103,7 +103,7 @@ char *grpc_op_string(const grpc_op *op) {
   return out;
 }
 
-void grpc_call_log_batch(char *file, int line, gpr_log_severity severity,
+void grpc_call_log_batch(const char *file, int line, gpr_log_severity severity,
                          grpc_call *call, const grpc_op *ops, size_t nops,
                          void *tag) {
   char *tmp;
diff --git a/src/core/lib/surface/init.c b/src/core/lib/surface/init.c
index d199ac060e..898476daee 100644
--- a/src/core/lib/surface/init.c
+++ b/src/core/lib/surface/init.c
@@ -28,6 +28,7 @@
 #include "src/core/lib/channel/channel_stack.h"
 #include "src/core/lib/channel/connected_channel.h"
 #include "src/core/lib/channel/handshaker_registry.h"
+#include "src/core/lib/debug/stats.h"
 #include "src/core/lib/debug/trace.h"
 #include "src/core/lib/http/parser.h"
 #include "src/core/lib/iomgr/combiner.h"
@@ -118,6 +119,7 @@ void grpc_init(void) {
   gpr_mu_lock(&g_init_mu);
   if (++g_initializations == 1) {
     gpr_time_init();
+    grpc_stats_init();
     grpc_slice_intern_init();
     grpc_mdctx_global_init();
     grpc_channel_init_init();
@@ -186,6 +188,7 @@ void grpc_shutdown(void) {
     grpc_mdctx_global_shutdown(&exec_ctx);
     grpc_handshaker_factory_registry_shutdown(&exec_ctx);
     grpc_slice_intern_shutdown();
+    grpc_stats_shutdown();
   }
   gpr_mu_unlock(&g_init_mu);
   grpc_exec_ctx_finish(&exec_ctx);
diff --git a/src/core/lib/transport/byte_stream.c b/src/core/lib/transport/byte_stream.c
index fb03a10315..08f61629a9 100644
--- a/src/core/lib/transport/byte_stream.c
+++ b/src/core/lib/transport/byte_stream.c
@@ -85,6 +85,7 @@ static void slice_buffer_stream_shutdown(grpc_exec_ctx *exec_ctx,
 static void slice_buffer_stream_destroy(grpc_exec_ctx *exec_ctx,
                                         grpc_byte_stream *byte_stream) {
   grpc_slice_buffer_stream *stream = (grpc_slice_buffer_stream *)byte_stream;
+  grpc_slice_buffer_reset_and_unref_internal(exec_ctx, stream->backing_buffer);
   GRPC_ERROR_UNREF(stream->shutdown_error);
 }
 
diff --git a/src/core/lib/transport/byte_stream.h b/src/core/lib/transport/byte_stream.h
index 1e1e8310b8..be2a35213e 100644
--- a/src/core/lib/transport/byte_stream.h
+++ b/src/core/lib/transport/byte_stream.h
@@ -81,7 +81,9 @@ void grpc_byte_stream_destroy(grpc_exec_ctx *exec_ctx,
 
 // grpc_slice_buffer_stream
 //
-// A grpc_byte_stream that wraps a slice buffer.
+// A grpc_byte_stream that wraps a slice buffer.  The stream takes
+// ownership of the slices in the buffer, and on destruction will
+// reset the contents of the buffer.
 
 typedef struct grpc_slice_buffer_stream {
   grpc_byte_stream base;
diff --git a/src/core/lib/transport/metadata_batch.c b/src/core/lib/transport/metadata_batch.c
index 8f24b8527c..a077052561 100644
--- a/src/core/lib/transport/metadata_batch.c
+++ b/src/core/lib/transport/metadata_batch.c
@@ -105,6 +105,7 @@ static grpc_error *maybe_link_callout(grpc_metadata_batch *batch,
     return GRPC_ERROR_NONE;
   }
   if (batch->idx.array[idx] == NULL) {
+    if (grpc_static_callout_is_default[idx]) ++batch->list.default_count;
     batch->idx.array[idx] = storage;
     return GRPC_ERROR_NONE;
   }
@@ -120,6 +121,7 @@ static void maybe_unlink_callout(grpc_metadata_batch *batch,
   if (idx == GRPC_BATCH_CALLOUTS_COUNT) {
     return;
   }
+  if (grpc_static_callout_is_default[idx]) --batch->list.default_count;
   GPR_ASSERT(batch->idx.array[idx] != NULL);
   batch->idx.array[idx] = NULL;
 }
diff --git a/src/core/lib/transport/metadata_batch.h b/src/core/lib/transport/metadata_batch.h
index 1b11a3e252..57d298c75c 100644
--- a/src/core/lib/transport/metadata_batch.h
+++ b/src/core/lib/transport/metadata_batch.h
@@ -41,6 +41,7 @@ typedef struct grpc_linked_mdelem {
 
 typedef struct grpc_mdelem_list {
   size_t count;
+  size_t default_count;  // Number of default keys.
   grpc_linked_mdelem *head;
   grpc_linked_mdelem *tail;
 } grpc_mdelem_list;
diff --git a/src/core/lib/transport/static_metadata.c b/src/core/lib/transport/static_metadata.c
index 28f05d5c44..b20d94aeac 100644
--- a/src/core/lib/transport/static_metadata.c
+++ b/src/core/lib/transport/static_metadata.c
@@ -823,6 +823,31 @@ grpc_mdelem_data grpc_static_mdelem_table[GRPC_STATIC_MDELEM_COUNT] = {
      {.refcount = &grpc_static_metadata_refcounts[97],
       .data.refcounted = {g_bytes + 1040, 13}}},
 };
+bool grpc_static_callout_is_default[GRPC_BATCH_CALLOUTS_COUNT] = {
+    true,  // :path
+    true,  // :method
+    true,  // :status
+    true,  // :authority
+    true,  // :scheme
+    true,  // te
+    true,  // grpc-message
+    true,  // grpc-status
+    true,  // grpc-payload-bin
+    true,  // grpc-encoding
+    true,  // grpc-accept-encoding
+    true,  // grpc-server-stats-bin
+    true,  // grpc-tags-bin
+    true,  // grpc-trace-bin
+    true,  // content-type
+    true,  // content-encoding
+    true,  // accept-encoding
+    true,  // grpc-internal-encoding-request
+    true,  // grpc-internal-stream-encoding-request
+    true,  // user-agent
+    true,  // host
+    true,  // lb-token
+};
+
 const uint8_t grpc_static_accept_encoding_metadata[8] = {0,  76, 77, 78,
                                                          79, 80, 81, 82};
 
diff --git a/src/core/lib/transport/static_metadata.h b/src/core/lib/transport/static_metadata.h
index 93ab90dff8..f03a9d23b1 100644
--- a/src/core/lib/transport/static_metadata.h
+++ b/src/core/lib/transport/static_metadata.h
@@ -571,6 +571,8 @@ typedef union {
              GRPC_BATCH_CALLOUTS_COUNT)                 \
        : GRPC_BATCH_CALLOUTS_COUNT)
 
+extern bool grpc_static_callout_is_default[GRPC_BATCH_CALLOUTS_COUNT];
+
 extern const uint8_t grpc_static_accept_encoding_metadata[8];
 #define GRPC_MDELEM_ACCEPT_ENCODING_FOR_ALGORITHMS(algs)                       \
   (GRPC_MAKE_MDELEM(                                                           \