aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core/iomgr/tcp_posix.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/iomgr/tcp_posix.c')
-rw-r--r--src/core/iomgr/tcp_posix.c561
1 files changed, 561 insertions, 0 deletions
diff --git a/src/core/iomgr/tcp_posix.c b/src/core/iomgr/tcp_posix.c
new file mode 100644
index 0000000000..8f63f75612
--- /dev/null
+++ b/src/core/iomgr/tcp_posix.c
@@ -0,0 +1,561 @@
+/*
+ *
+ * Copyright 2014, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "src/core/iomgr/tcp_posix.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <grpc/support/alloc.h>
+#include <grpc/support/log.h>
+#include <grpc/support/slice.h>
+#include <grpc/support/string.h>
+#include <grpc/support/sync.h>
+#include <grpc/support/time.h>
+
+/* Holds a slice array and associated state. */
+typedef struct grpc_tcp_slice_state {
+ gpr_slice *slices; /* Array of slices */
+ size_t nslices; /* Size of slices array. */
+ ssize_t first_slice; /* First valid slice in array */
+ ssize_t last_slice; /* Last valid slice in array */
+ gpr_slice working_slice; /* pointer to original final slice */
+ int working_slice_valid; /* True if there is a working slice */
+ int memory_owned; /* True if slices array is owned */
+} grpc_tcp_slice_state;
+
+static void slice_state_init(grpc_tcp_slice_state *state, gpr_slice *slices,
+ size_t nslices, size_t valid_slices) {
+ state->slices = slices;
+ state->nslices = nslices;
+ if (valid_slices == 0) {
+ state->first_slice = -1;
+ } else {
+ state->first_slice = 0;
+ }
+ state->last_slice = valid_slices - 1;
+ state->working_slice_valid = 0;
+ state->memory_owned = 0;
+}
+
+/* Returns true if there is still available data */
+static int slice_state_has_available(grpc_tcp_slice_state *state) {
+ return state->first_slice != -1 && state->last_slice >= state->first_slice;
+}
+
+static ssize_t slice_state_slices_allocated(grpc_tcp_slice_state *state) {
+ if (state->first_slice == -1) {
+ return 0;
+ } else {
+ return state->last_slice - state->first_slice + 1;
+ }
+}
+
+static void slice_state_realloc(grpc_tcp_slice_state *state, size_t new_size) {
+ /* TODO(klempner): use realloc instead when first_slice is 0 */
+ /* TODO(klempner): Avoid a realloc in cases where it is unnecessary */
+ gpr_slice *slices = state->slices;
+ size_t original_size = slice_state_slices_allocated(state);
+ size_t i;
+ gpr_slice *new_slices = gpr_malloc(sizeof(gpr_slice) * new_size);
+
+ for (i = 0; i < original_size; ++i) {
+ new_slices[i] = slices[i + state->first_slice];
+ }
+
+ state->slices = new_slices;
+ state->last_slice = original_size - 1;
+ if (original_size > 0) {
+ state->first_slice = 0;
+ } else {
+ state->first_slice = -1;
+ }
+ state->nslices = new_size;
+
+ if (state->memory_owned) {
+ gpr_free(slices);
+ }
+ state->memory_owned = 1;
+}
+
+static void slice_state_remove_prefix(grpc_tcp_slice_state *state,
+ size_t prefix_bytes) {
+ gpr_slice *current_slice = &state->slices[state->first_slice];
+ size_t current_slice_size;
+
+ while (slice_state_has_available(state)) {
+ current_slice_size = GPR_SLICE_LENGTH(*current_slice);
+ if (current_slice_size > prefix_bytes) {
+ /* TODO(klempner): Get rid of the extra refcount created here by adding a
+ native "trim the first N bytes" operation to splice */
+ /* TODO(klempner): This really shouldn't be modifying the current slice
+ unless we own the slices array. */
+ *current_slice = gpr_slice_split_tail(current_slice, prefix_bytes);
+ gpr_slice_unref(*current_slice);
+ return;
+ } else {
+ gpr_slice_unref(*current_slice);
+ ++state->first_slice;
+ ++current_slice;
+ prefix_bytes -= current_slice_size;
+ }
+ }
+}
+
+static void slice_state_destroy(grpc_tcp_slice_state *state) {
+ while (slice_state_has_available(state)) {
+ gpr_slice_unref(state->slices[state->first_slice]);
+ ++state->first_slice;
+ }
+
+ if (state->memory_owned) {
+ gpr_free(state->slices);
+ state->memory_owned = 0;
+ }
+}
+
+void slice_state_transfer_ownership(grpc_tcp_slice_state *state,
+ gpr_slice **slices, size_t *nslices) {
+ *slices = state->slices + state->first_slice;
+ *nslices = state->last_slice - state->first_slice + 1;
+
+ state->first_slice = -1;
+ state->last_slice = -1;
+}
+
+/* Fills iov with the first min(iov_size, available) slices, returns number
+ filled */
+static size_t slice_state_to_iovec(grpc_tcp_slice_state *state,
+ struct iovec *iov, size_t iov_size) {
+ size_t nslices = state->last_slice - state->first_slice + 1;
+ gpr_slice *slices = state->slices + state->first_slice;
+ size_t i;
+ if (nslices < iov_size) {
+ iov_size = nslices;
+ }
+
+ for (i = 0; i < iov_size; ++i) {
+ iov[i].iov_base = GPR_SLICE_START_PTR(slices[i]);
+ iov[i].iov_len = GPR_SLICE_LENGTH(slices[i]);
+ }
+ return iov_size;
+}
+
+/* Makes n blocks available at the end of state, writes them into iov, and
+ returns the number of bytes allocated */
+static size_t slice_state_append_blocks_into_iovec(grpc_tcp_slice_state *state,
+ struct iovec *iov, size_t n,
+ size_t slice_size) {
+ size_t target_size;
+ size_t i;
+ size_t allocated_bytes;
+ ssize_t allocated_slices = slice_state_slices_allocated(state);
+
+ if (n - state->working_slice_valid >= state->nslices - state->last_slice) {
+ /* Need to grow the slice array */
+ target_size = state->nslices;
+ do {
+ target_size = target_size * 2;
+ } while (target_size < allocated_slices + n - state->working_slice_valid);
+ /* TODO(klempner): If this ever needs to support both prefix removal and
+ append, we should be smarter about the growth logic here */
+ slice_state_realloc(state, target_size);
+ }
+
+ i = 0;
+ allocated_bytes = 0;
+
+ if (state->working_slice_valid) {
+ iov[0].iov_base = GPR_SLICE_END_PTR(state->slices[state->last_slice]);
+ iov[0].iov_len = GPR_SLICE_LENGTH(state->working_slice) -
+ GPR_SLICE_LENGTH(state->slices[state->last_slice]);
+ allocated_bytes += iov[0].iov_len;
+ ++i;
+ state->slices[state->last_slice] = state->working_slice;
+ state->working_slice_valid = 0;
+ }
+
+ for (; i < n; ++i) {
+ ++state->last_slice;
+ state->slices[state->last_slice] = gpr_slice_malloc(slice_size);
+ iov[i].iov_base = GPR_SLICE_START_PTR(state->slices[state->last_slice]);
+ iov[i].iov_len = slice_size;
+ allocated_bytes += slice_size;
+ }
+ if (state->first_slice == -1) {
+ state->first_slice = 0;
+ }
+ return allocated_bytes;
+}
+
+/* Remove the last n bytes from state */
+/* TODO(klempner): Consider having this defer actual deletion until later */
+static void slice_state_remove_last(grpc_tcp_slice_state *state, size_t bytes) {
+ while (bytes > 0 && slice_state_has_available(state)) {
+ if (GPR_SLICE_LENGTH(state->slices[state->last_slice]) > bytes) {
+ state->working_slice = state->slices[state->last_slice];
+ state->working_slice_valid = 1;
+ /* TODO(klempner): Combine these into a single operation that doesn't need
+ to refcount */
+ gpr_slice_unref(gpr_slice_split_tail(
+ &state->slices[state->last_slice],
+ GPR_SLICE_LENGTH(state->slices[state->last_slice]) - bytes));
+ bytes = 0;
+ } else {
+ bytes -= GPR_SLICE_LENGTH(state->slices[state->last_slice]);
+ gpr_slice_unref(state->slices[state->last_slice]);
+ --state->last_slice;
+ if (state->last_slice == -1) {
+ state->first_slice = -1;
+ }
+ }
+ }
+}
+
+typedef struct {
+ grpc_endpoint base;
+ grpc_fd *em_fd;
+ int fd;
+ size_t slice_size;
+ gpr_refcount refcount;
+
+ grpc_endpoint_read_cb read_cb;
+ void *read_user_data;
+ gpr_timespec read_deadline;
+ grpc_endpoint_write_cb write_cb;
+ void *write_user_data;
+ gpr_timespec write_deadline;
+
+ grpc_tcp_slice_state write_state;
+} grpc_tcp;
+
+static void grpc_tcp_handle_read(void *arg /* grpc_tcp */,
+ grpc_iomgr_cb_status status);
+static void grpc_tcp_handle_write(void *arg /* grpc_tcp */,
+ grpc_iomgr_cb_status status);
+
+static void grpc_tcp_shutdown(grpc_endpoint *ep) {
+ grpc_tcp *tcp = (grpc_tcp *)ep;
+ grpc_fd_shutdown(tcp->em_fd);
+}
+
+static void grpc_tcp_unref(grpc_tcp *tcp) {
+ int refcount_zero = gpr_unref(&tcp->refcount);
+ if (refcount_zero) {
+ grpc_fd_destroy(tcp->em_fd);
+ gpr_free(tcp);
+ }
+}
+
+static void grpc_tcp_destroy(grpc_endpoint *ep) {
+ grpc_tcp *tcp = (grpc_tcp *)ep;
+ grpc_tcp_unref(tcp);
+}
+
+static void call_read_cb(grpc_tcp *tcp, gpr_slice *slices, size_t nslices,
+ grpc_endpoint_cb_status status) {
+ grpc_endpoint_read_cb cb = tcp->read_cb;
+
+#ifdef GRPC_TRACE_TCP
+ size_t i;
+ gpr_log(GPR_DEBUG, "read: status=%d", status);
+ for (i = 0; i < nslices; i++) {
+ char *dump =
+ gpr_hexdump((char *)GPR_SLICE_START_PTR(slices[i]),
+ GPR_SLICE_LENGTH(slices[i]), GPR_HEXDUMP_PLAINTEXT);
+ gpr_log(GPR_DEBUG, "READ: %s", dump);
+ gpr_free(dump);
+ }
+#endif
+
+ tcp->read_cb = NULL;
+ cb(tcp->read_user_data, slices, nslices, status);
+}
+
+#define INLINE_SLICE_BUFFER_SIZE 8
+#define MAX_READ_IOVEC 4
+static void grpc_tcp_handle_read(void *arg /* grpc_tcp */,
+ grpc_iomgr_cb_status status) {
+ grpc_tcp *tcp = (grpc_tcp *)arg;
+ int iov_size = 1;
+ gpr_slice static_read_slices[INLINE_SLICE_BUFFER_SIZE];
+ struct msghdr msg;
+ struct iovec iov[MAX_READ_IOVEC];
+ ssize_t read_bytes;
+ ssize_t allocated_bytes;
+ struct grpc_tcp_slice_state read_state;
+ gpr_slice *final_slices;
+ size_t final_nslices;
+
+ slice_state_init(&read_state, static_read_slices, INLINE_SLICE_BUFFER_SIZE,
+ 0);
+
+ if (status == GRPC_CALLBACK_CANCELLED) {
+ call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_SHUTDOWN);
+ grpc_tcp_unref(tcp);
+ return;
+ }
+
+ if (status == GRPC_CALLBACK_TIMED_OUT) {
+ call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_TIMED_OUT);
+ grpc_tcp_unref(tcp);
+ return;
+ }
+
+ /* TODO(klempner): Limit the amount we read at once. */
+ for (;;) {
+ allocated_bytes = slice_state_append_blocks_into_iovec(
+ &read_state, iov, iov_size, tcp->slice_size);
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iov_size;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ do {
+ read_bytes = recvmsg(tcp->fd, &msg, 0);
+ } while (read_bytes < 0 && errno == EINTR);
+
+ if (read_bytes < allocated_bytes) {
+ /* TODO(klempner): Consider a second read first, in hopes of getting a
+ * quick EAGAIN and saving a bunch of allocations. */
+ slice_state_remove_last(&read_state, read_bytes < 0
+ ? allocated_bytes
+ : allocated_bytes - read_bytes);
+ }
+
+ if (read_bytes < 0) {
+ /* NB: After calling the user_cb a parallel call of the read handler may
+ * be running. */
+ if (errno == EAGAIN) {
+ if (slice_state_has_available(&read_state)) {
+ /* TODO(klempner): We should probably do the call into the application
+ without all this junk on the stack */
+ /* FIXME(klempner): Refcount properly */
+ slice_state_transfer_ownership(&read_state, &final_slices,
+ &final_nslices);
+ call_read_cb(tcp, final_slices, final_nslices, GRPC_ENDPOINT_CB_OK);
+ slice_state_destroy(&read_state);
+ grpc_tcp_unref(tcp);
+ } else {
+ /* Spurious read event, consume it here */
+ slice_state_destroy(&read_state);
+ grpc_fd_notify_on_read(tcp->em_fd, grpc_tcp_handle_read, tcp,
+ tcp->read_deadline);
+ }
+ } else {
+ /* TODO(klempner): Log interesting errors */
+ call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_ERROR);
+ slice_state_destroy(&read_state);
+ grpc_tcp_unref(tcp);
+ }
+ return;
+ } else if (read_bytes == 0) {
+ /* 0 read size ==> end of stream */
+ if (slice_state_has_available(&read_state)) {
+ /* there were bytes already read: pass them up to the application */
+ slice_state_transfer_ownership(&read_state, &final_slices,
+ &final_nslices);
+ call_read_cb(tcp, final_slices, final_nslices, GRPC_ENDPOINT_CB_EOF);
+ } else {
+ call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_EOF);
+ }
+ slice_state_destroy(&read_state);
+ grpc_tcp_unref(tcp);
+ return;
+ } else if (iov_size < MAX_READ_IOVEC) {
+ ++iov_size;
+ }
+ }
+}
+
+static void grpc_tcp_notify_on_read(grpc_endpoint *ep, grpc_endpoint_read_cb cb,
+ void *user_data, gpr_timespec deadline) {
+ grpc_tcp *tcp = (grpc_tcp *)ep;
+ GPR_ASSERT(tcp->read_cb == NULL);
+ tcp->read_cb = cb;
+ tcp->read_user_data = user_data;
+ tcp->read_deadline = deadline;
+ gpr_ref(&tcp->refcount);
+ grpc_fd_notify_on_read(tcp->em_fd, grpc_tcp_handle_read, tcp, deadline);
+}
+
+#define MAX_WRITE_IOVEC 16
+static grpc_endpoint_write_status grpc_tcp_flush(grpc_tcp *tcp) {
+ struct msghdr msg;
+ struct iovec iov[MAX_WRITE_IOVEC];
+ int iov_size;
+ ssize_t sent_length;
+ grpc_tcp_slice_state *state = &tcp->write_state;
+
+ for (;;) {
+ iov_size = slice_state_to_iovec(state, iov, MAX_WRITE_IOVEC);
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iov_size;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ do {
+ /* TODO(klempner): Cork if this is a partial write */
+ sent_length = sendmsg(tcp->fd, &msg, 0);
+ } while (sent_length < 0 && errno == EINTR);
+
+ if (sent_length < 0) {
+ if (errno == EAGAIN) {
+ return GRPC_ENDPOINT_WRITE_PENDING;
+ } else {
+ /* TODO(klempner): Log some of these */
+ slice_state_destroy(state);
+ return GRPC_ENDPOINT_WRITE_ERROR;
+ }
+ }
+
+ /* TODO(klempner): Probably better to batch this after we finish flushing */
+ slice_state_remove_prefix(state, sent_length);
+
+ if (!slice_state_has_available(state)) {
+ return GRPC_ENDPOINT_WRITE_DONE;
+ }
+ };
+}
+
+static void grpc_tcp_handle_write(void *arg /* grpc_tcp */,
+ grpc_iomgr_cb_status status) {
+ grpc_tcp *tcp = (grpc_tcp *)arg;
+ grpc_endpoint_write_status write_status;
+ grpc_endpoint_cb_status cb_status;
+ grpc_endpoint_write_cb cb;
+
+ cb_status = GRPC_ENDPOINT_CB_OK;
+
+ if (status == GRPC_CALLBACK_CANCELLED) {
+ cb_status = GRPC_ENDPOINT_CB_SHUTDOWN;
+ } else if (status == GRPC_CALLBACK_TIMED_OUT) {
+ cb_status = GRPC_ENDPOINT_CB_TIMED_OUT;
+ }
+
+ if (cb_status != GRPC_ENDPOINT_CB_OK) {
+ slice_state_destroy(&tcp->write_state);
+ cb = tcp->write_cb;
+ tcp->write_cb = NULL;
+ cb(tcp->write_user_data, cb_status);
+ grpc_tcp_unref(tcp);
+ return;
+ }
+
+ write_status = grpc_tcp_flush(tcp);
+ if (write_status == GRPC_ENDPOINT_WRITE_PENDING) {
+ grpc_fd_notify_on_write(tcp->em_fd, grpc_tcp_handle_write, tcp,
+ tcp->write_deadline);
+ } else {
+ slice_state_destroy(&tcp->write_state);
+ if (write_status == GRPC_ENDPOINT_WRITE_DONE) {
+ cb_status = GRPC_ENDPOINT_CB_OK;
+ } else {
+ cb_status = GRPC_ENDPOINT_CB_ERROR;
+ }
+ cb = tcp->write_cb;
+ tcp->write_cb = NULL;
+ cb(tcp->write_user_data, cb_status);
+ grpc_tcp_unref(tcp);
+ }
+}
+
+static grpc_endpoint_write_status grpc_tcp_write(
+ grpc_endpoint *ep, gpr_slice *slices, size_t nslices,
+ grpc_endpoint_write_cb cb, void *user_data, gpr_timespec deadline) {
+ grpc_tcp *tcp = (grpc_tcp *)ep;
+ grpc_endpoint_write_status status;
+
+#ifdef GRPC_TRACE_TCP
+ size_t i;
+
+ for (i = 0; i < nslices; i++) {
+ char *data =
+ gpr_hexdump((char *)GPR_SLICE_START_PTR(slices[i]),
+ GPR_SLICE_LENGTH(slices[i]), GPR_HEXDUMP_PLAINTEXT);
+ gpr_log(GPR_DEBUG, "WRITE %p: %s", tcp, data);
+ gpr_free(data);
+ }
+#endif
+
+ GPR_ASSERT(tcp->write_cb == NULL);
+ slice_state_init(&tcp->write_state, slices, nslices, nslices);
+
+ status = grpc_tcp_flush(tcp);
+ if (status == GRPC_ENDPOINT_WRITE_PENDING) {
+ /* TODO(klempner): Consider inlining rather than malloc for small nslices */
+ slice_state_realloc(&tcp->write_state, nslices);
+ gpr_ref(&tcp->refcount);
+ tcp->write_cb = cb;
+ tcp->write_user_data = user_data;
+ tcp->write_deadline = deadline;
+ grpc_fd_notify_on_write(tcp->em_fd, grpc_tcp_handle_write, tcp,
+ tcp->write_deadline);
+ }
+
+ return status;
+}
+
+static const grpc_endpoint_vtable vtable = {grpc_tcp_notify_on_read,
+ grpc_tcp_write, grpc_tcp_shutdown,
+ grpc_tcp_destroy};
+
+grpc_endpoint *grpc_tcp_create(grpc_fd *em_fd, size_t slice_size) {
+ grpc_tcp *tcp = (grpc_tcp *)gpr_malloc(sizeof(grpc_tcp));
+ tcp->base.vtable = &vtable;
+ tcp->fd = grpc_fd_get(em_fd);
+ tcp->read_cb = NULL;
+ tcp->write_cb = NULL;
+ tcp->read_user_data = NULL;
+ tcp->write_user_data = NULL;
+ tcp->slice_size = slice_size;
+ tcp->read_deadline = gpr_inf_future;
+ tcp->write_deadline = gpr_inf_future;
+ slice_state_init(&tcp->write_state, NULL, 0, 0);
+ /* paired with unref in grpc_tcp_destroy */
+ gpr_ref_init(&tcp->refcount, 1);
+ tcp->em_fd = em_fd;
+ return &tcp->base;
+}