diff options
Diffstat (limited to 'src/core/ext')
10 files changed, 1226 insertions, 134 deletions
diff --git a/src/core/ext/filters/client_channel/client_channel_channelz.cc b/src/core/ext/filters/client_channel/client_channel_channelz.cc index bad1ef668c..8e5426081c 100644 --- a/src/core/ext/filters/client_channel/client_channel_channelz.cc +++ b/src/core/ext/filters/client_channel/client_channel_channelz.cc @@ -128,7 +128,8 @@ void SubchannelNode::PopulateConnectivityState(grpc_json* json) { if (subchannel_ == nullptr) { state = GRPC_CHANNEL_SHUTDOWN; } else { - state = grpc_subchannel_check_connectivity(subchannel_, nullptr); + state = grpc_subchannel_check_connectivity( + subchannel_, nullptr, true /* inhibit_health_checking */); } json = grpc_json_create_child(nullptr, json, "state", nullptr, GRPC_JSON_OBJECT, false); diff --git a/src/core/ext/filters/client_channel/health/health.pb.c b/src/core/ext/filters/client_channel/health/health.pb.c new file mode 100644 index 0000000000..5499c549cc --- /dev/null +++ b/src/core/ext/filters/client_channel/health/health.pb.c @@ -0,0 +1,23 @@ +/* Automatically generated nanopb constant definitions */ +/* Generated by nanopb-0.3.7-dev */ + +#include "src/core/ext/filters/client_channel/health/health.pb.h" +/* @@protoc_insertion_point(includes) */ +#if PB_PROTO_HEADER_VERSION != 30 +#error Regenerate this file with the current version of nanopb generator. +#endif + + + +const pb_field_t grpc_health_v1_HealthCheckRequest_fields[2] = { + PB_FIELD( 1, STRING , OPTIONAL, STATIC , FIRST, grpc_health_v1_HealthCheckRequest, service, service, 0), + PB_LAST_FIELD +}; + +const pb_field_t grpc_health_v1_HealthCheckResponse_fields[2] = { + PB_FIELD( 1, UENUM , OPTIONAL, STATIC , FIRST, grpc_health_v1_HealthCheckResponse, status, status, 0), + PB_LAST_FIELD +}; + + +/* @@protoc_insertion_point(eof) */ diff --git a/src/core/ext/filters/client_channel/health/health.pb.h b/src/core/ext/filters/client_channel/health/health.pb.h new file mode 100644 index 0000000000..9d54ccd618 --- /dev/null +++ b/src/core/ext/filters/client_channel/health/health.pb.h @@ -0,0 +1,73 @@ +/* Automatically generated nanopb header */ +/* Generated by nanopb-0.3.7-dev */ + +#ifndef PB_GRPC_HEALTH_V1_HEALTH_PB_H_INCLUDED +#define PB_GRPC_HEALTH_V1_HEALTH_PB_H_INCLUDED +#include "pb.h" +/* @@protoc_insertion_point(includes) */ +#if PB_PROTO_HEADER_VERSION != 30 +#error Regenerate this file with the current version of nanopb generator. +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Enum definitions */ +typedef enum _grpc_health_v1_HealthCheckResponse_ServingStatus { + grpc_health_v1_HealthCheckResponse_ServingStatus_UNKNOWN = 0, + grpc_health_v1_HealthCheckResponse_ServingStatus_SERVING = 1, + grpc_health_v1_HealthCheckResponse_ServingStatus_NOT_SERVING = 2, + grpc_health_v1_HealthCheckResponse_ServingStatus_SERVICE_UNKNOWN = 3 +} grpc_health_v1_HealthCheckResponse_ServingStatus; +#define _grpc_health_v1_HealthCheckResponse_ServingStatus_MIN grpc_health_v1_HealthCheckResponse_ServingStatus_UNKNOWN +#define _grpc_health_v1_HealthCheckResponse_ServingStatus_MAX grpc_health_v1_HealthCheckResponse_ServingStatus_SERVICE_UNKNOWN +#define _grpc_health_v1_HealthCheckResponse_ServingStatus_ARRAYSIZE ((grpc_health_v1_HealthCheckResponse_ServingStatus)(grpc_health_v1_HealthCheckResponse_ServingStatus_SERVICE_UNKNOWN+1)) + +/* Struct definitions */ +typedef struct _grpc_health_v1_HealthCheckRequest { + bool has_service; + char service[200]; +/* @@protoc_insertion_point(struct:grpc_health_v1_HealthCheckRequest) */ +} grpc_health_v1_HealthCheckRequest; + +typedef struct _grpc_health_v1_HealthCheckResponse { + bool has_status; + grpc_health_v1_HealthCheckResponse_ServingStatus status; +/* @@protoc_insertion_point(struct:grpc_health_v1_HealthCheckResponse) */ +} grpc_health_v1_HealthCheckResponse; + +/* Default values for struct fields */ + +/* Initializer values for message structs */ +#define grpc_health_v1_HealthCheckRequest_init_default {false, ""} +#define grpc_health_v1_HealthCheckResponse_init_default {false, (grpc_health_v1_HealthCheckResponse_ServingStatus)0} +#define grpc_health_v1_HealthCheckRequest_init_zero {false, ""} +#define grpc_health_v1_HealthCheckResponse_init_zero {false, (grpc_health_v1_HealthCheckResponse_ServingStatus)0} + +/* Field tags (for use in manual encoding/decoding) */ +#define grpc_health_v1_HealthCheckRequest_service_tag 1 +#define grpc_health_v1_HealthCheckResponse_status_tag 1 + +/* Struct field encoding specification for nanopb */ +extern const pb_field_t grpc_health_v1_HealthCheckRequest_fields[2]; +extern const pb_field_t grpc_health_v1_HealthCheckResponse_fields[2]; + +/* Maximum encoded size of messages (where known) */ +#define grpc_health_v1_HealthCheckRequest_size 203 +#define grpc_health_v1_HealthCheckResponse_size 2 + +/* Message IDs (where set with "msgid" option) */ +#ifdef PB_MSGID + +#define HEALTH_MESSAGES \ + + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif +/* @@protoc_insertion_point(eof) */ + +#endif diff --git a/src/core/ext/filters/client_channel/health/health_check_client.cc b/src/core/ext/filters/client_channel/health/health_check_client.cc new file mode 100644 index 0000000000..400d99b07c --- /dev/null +++ b/src/core/ext/filters/client_channel/health/health_check_client.cc @@ -0,0 +1,646 @@ +/* + * + * Copyright 2018 gRPC authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include <grpc/support/port_platform.h> + +#include <stdint.h> + +#include "src/core/ext/filters/client_channel/health/health_check_client.h" + +#include "pb_decode.h" +#include "pb_encode.h" +#include "src/core/ext/filters/client_channel/health/health.pb.h" +#include "src/core/lib/debug/trace.h" +#include "src/core/lib/gprpp/mutex_lock.h" +#include "src/core/lib/slice/slice_internal.h" +#include "src/core/lib/transport/error_utils.h" +#include "src/core/lib/transport/status_metadata.h" + +#define HEALTH_CHECK_INITIAL_CONNECT_BACKOFF_SECONDS 1 +#define HEALTH_CHECK_RECONNECT_BACKOFF_MULTIPLIER 1.6 +#define HEALTH_CHECK_RECONNECT_MAX_BACKOFF_SECONDS 120 +#define HEALTH_CHECK_RECONNECT_JITTER 0.2 + +grpc_core::TraceFlag grpc_health_check_client_trace(false, + "health_check_client"); + +namespace grpc_core { + +// +// HealthCheckClient +// + +HealthCheckClient::HealthCheckClient( + const char* service_name, + RefCountedPtr<ConnectedSubchannel> connected_subchannel, + grpc_pollset_set* interested_parties, + grpc_core::RefCountedPtr<grpc_core::channelz::SubchannelNode> channelz_node) + : InternallyRefCountedWithTracing<HealthCheckClient>( + &grpc_health_check_client_trace), + service_name_(service_name), + connected_subchannel_(std::move(connected_subchannel)), + interested_parties_(interested_parties), + channelz_node_(std::move(channelz_node)), + retry_backoff_( + BackOff::Options() + .set_initial_backoff( + HEALTH_CHECK_INITIAL_CONNECT_BACKOFF_SECONDS * 1000) + .set_multiplier(HEALTH_CHECK_RECONNECT_BACKOFF_MULTIPLIER) + .set_jitter(HEALTH_CHECK_RECONNECT_JITTER) + .set_max_backoff(HEALTH_CHECK_RECONNECT_MAX_BACKOFF_SECONDS * + 1000)) { + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "created HealthCheckClient %p", this); + } + GRPC_CLOSURE_INIT(&retry_timer_callback_, OnRetryTimer, this, + grpc_schedule_on_exec_ctx); + gpr_mu_init(&mu_); + StartCall(); +} + +HealthCheckClient::~HealthCheckClient() { + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "destroying HealthCheckClient %p", this); + } + GRPC_ERROR_UNREF(error_); + gpr_mu_destroy(&mu_); +} + +void HealthCheckClient::NotifyOnHealthChange(grpc_connectivity_state* state, + grpc_closure* closure) { + MutexLock lock(&mu_); + GPR_ASSERT(notify_state_ == nullptr); + if (*state != state_) { + *state = state_; + GRPC_CLOSURE_SCHED(closure, GRPC_ERROR_REF(error_)); + return; + } + notify_state_ = state; + on_health_changed_ = closure; +} + +void HealthCheckClient::SetHealthStatus(grpc_connectivity_state state, + grpc_error* error) { + MutexLock lock(&mu_); + SetHealthStatusLocked(state, error); +} + +void HealthCheckClient::SetHealthStatusLocked(grpc_connectivity_state state, + grpc_error* error) { + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "HealthCheckClient %p: setting state=%d error=%s", this, + state, grpc_error_string(error)); + } + if (notify_state_ != nullptr && *notify_state_ != state) { + *notify_state_ = state; + notify_state_ = nullptr; + GRPC_CLOSURE_SCHED(on_health_changed_, GRPC_ERROR_REF(error)); + on_health_changed_ = nullptr; + } + state_ = state; + GRPC_ERROR_UNREF(error_); + error_ = error; +} + +void HealthCheckClient::Orphan() { + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "HealthCheckClient %p: shutting down", this); + } + { + MutexLock lock(&mu_); + if (on_health_changed_ != nullptr) { + *notify_state_ = GRPC_CHANNEL_SHUTDOWN; + notify_state_ = nullptr; + GRPC_CLOSURE_SCHED(on_health_changed_, GRPC_ERROR_NONE); + on_health_changed_ = nullptr; + } + shutting_down_ = true; + call_state_.reset(); + if (retry_timer_callback_pending_) { + grpc_timer_cancel(&retry_timer_); + } + } + Unref(DEBUG_LOCATION, "orphan"); +} + +void HealthCheckClient::StartCall() { + MutexLock lock(&mu_); + StartCallLocked(); +} + +void HealthCheckClient::StartCallLocked() { + if (shutting_down_) return; + GPR_ASSERT(call_state_ == nullptr); + SetHealthStatusLocked(GRPC_CHANNEL_CONNECTING, GRPC_ERROR_NONE); + call_state_ = MakeOrphanable<CallState>(Ref(), interested_parties_); + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "HealthCheckClient %p: created CallState %p", this, + call_state_.get()); + } + call_state_->StartCall(); +} + +void HealthCheckClient::StartRetryTimer() { + MutexLock lock(&mu_); + SetHealthStatusLocked( + GRPC_CHANNEL_TRANSIENT_FAILURE, + GRPC_ERROR_CREATE_FROM_STATIC_STRING( + "health check call failed; will retry after backoff")); + grpc_millis next_try = retry_backoff_.NextAttemptTime(); + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "HealthCheckClient %p: health check call lost...", this); + grpc_millis timeout = next_try - ExecCtx::Get()->Now(); + if (timeout > 0) { + gpr_log(GPR_INFO, + "HealthCheckClient %p: ... will retry in %" PRId64 "ms.", this, + timeout); + } else { + gpr_log(GPR_INFO, "HealthCheckClient %p: ... retrying immediately.", + this); + } + } + // Ref for callback, tracked manually. + Ref(DEBUG_LOCATION, "health_retry_timer").release(); + retry_timer_callback_pending_ = true; + grpc_timer_init(&retry_timer_, next_try, &retry_timer_callback_); +} + +void HealthCheckClient::OnRetryTimer(void* arg, grpc_error* error) { + HealthCheckClient* self = static_cast<HealthCheckClient*>(arg); + { + MutexLock lock(&self->mu_); + self->retry_timer_callback_pending_ = false; + if (!self->shutting_down_ && error == GRPC_ERROR_NONE && + self->call_state_ == nullptr) { + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "HealthCheckClient %p: restarting health check call", + self); + } + self->StartCallLocked(); + } + } + self->Unref(DEBUG_LOCATION, "health_retry_timer"); +} + +// +// protobuf helpers +// + +namespace { + +void EncodeRequest(const char* service_name, + ManualConstructor<SliceBufferByteStream>* send_message) { + grpc_health_v1_HealthCheckRequest request_struct; + request_struct.has_service = true; + snprintf(request_struct.service, sizeof(request_struct.service), "%s", + service_name); + pb_ostream_t ostream; + memset(&ostream, 0, sizeof(ostream)); + pb_encode(&ostream, grpc_health_v1_HealthCheckRequest_fields, + &request_struct); + grpc_slice request_slice = GRPC_SLICE_MALLOC(ostream.bytes_written); + ostream = pb_ostream_from_buffer(GRPC_SLICE_START_PTR(request_slice), + GRPC_SLICE_LENGTH(request_slice)); + GPR_ASSERT(pb_encode(&ostream, grpc_health_v1_HealthCheckRequest_fields, + &request_struct) != 0); + grpc_slice_buffer slice_buffer; + grpc_slice_buffer_init(&slice_buffer); + grpc_slice_buffer_add(&slice_buffer, request_slice); + send_message->Init(&slice_buffer, 0); + grpc_slice_buffer_destroy_internal(&slice_buffer); +} + +// Returns true if healthy. +// If there was an error parsing the response, sets *error and returns false. +bool DecodeResponse(grpc_slice_buffer* slice_buffer, grpc_error** error) { + // If message is empty, assume unhealthy. + if (slice_buffer->length == 0) { + *error = + GRPC_ERROR_CREATE_FROM_STATIC_STRING("health check response was empty"); + return false; + } + // Concatenate the slices to form a single string. + UniquePtr<uint8_t> recv_message_deleter; + uint8_t* recv_message; + if (slice_buffer->count == 1) { + recv_message = GRPC_SLICE_START_PTR(slice_buffer->slices[0]); + } else { + recv_message = static_cast<uint8_t*>(gpr_malloc(slice_buffer->length)); + recv_message_deleter.reset(recv_message); + size_t offset = 0; + for (size_t i = 0; i < slice_buffer->count; ++i) { + memcpy(recv_message + offset, + GRPC_SLICE_START_PTR(slice_buffer->slices[i]), + GRPC_SLICE_LENGTH(slice_buffer->slices[i])); + offset += GRPC_SLICE_LENGTH(slice_buffer->slices[i]); + } + } + // Deserialize message. + grpc_health_v1_HealthCheckResponse response_struct; + pb_istream_t istream = + pb_istream_from_buffer(recv_message, slice_buffer->length); + if (!pb_decode(&istream, grpc_health_v1_HealthCheckResponse_fields, + &response_struct)) { + // Can't parse message; assume unhealthy. + *error = GRPC_ERROR_CREATE_FROM_STATIC_STRING( + "cannot parse health check response"); + return false; + } + if (!response_struct.has_status) { + // Field not present; assume unhealthy. + *error = GRPC_ERROR_CREATE_FROM_STATIC_STRING( + "status field not present in health check response"); + return false; + } + return response_struct.status == + grpc_health_v1_HealthCheckResponse_ServingStatus_SERVING; +} + +} // namespace + +// +// HealthCheckClient::CallState +// + +HealthCheckClient::CallState::CallState( + RefCountedPtr<HealthCheckClient> health_check_client, + grpc_pollset_set* interested_parties) + : InternallyRefCountedWithTracing<CallState>( + &grpc_health_check_client_trace), + health_check_client_(std::move(health_check_client)), + pollent_(grpc_polling_entity_create_from_pollset_set(interested_parties)), + arena_(gpr_arena_create(health_check_client_->connected_subchannel_ + ->GetInitialCallSizeEstimate(0))) { + memset(&call_combiner_, 0, sizeof(call_combiner_)); + grpc_call_combiner_init(&call_combiner_); + memset(context_, 0, sizeof(context_)); + gpr_atm_rel_store(&seen_response_, static_cast<gpr_atm>(0)); +} + +HealthCheckClient::CallState::~CallState() { + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, "HealthCheckClient %p: destroying CallState %p", + health_check_client_.get(), this); + } + if (call_ != nullptr) GRPC_SUBCHANNEL_CALL_UNREF(call_, "call_ended"); + // Unset the call combiner cancellation closure. This has the + // effect of scheduling the previously set cancellation closure, if + // any, so that it can release any internal references it may be + // holding to the call stack. Also flush the closures on exec_ctx so that + // filters that schedule cancel notification closures on exec_ctx do not + // need to take a ref of the call stack to guarantee closure liveness. + grpc_call_combiner_set_notify_on_cancel(&call_combiner_, nullptr); + grpc_core::ExecCtx::Get()->Flush(); + grpc_call_combiner_destroy(&call_combiner_); + gpr_arena_destroy(arena_); +} + +void HealthCheckClient::CallState::Orphan() { + grpc_call_combiner_cancel(&call_combiner_, GRPC_ERROR_CANCELLED); + Cancel(); +} + +void HealthCheckClient::CallState::StartCall() { + ConnectedSubchannel::CallArgs args = { + &pollent_, + GRPC_MDSTR_SLASH_GRPC_DOT_HEALTH_DOT_V1_DOT_HEALTH_SLASH_WATCH, + gpr_now(GPR_CLOCK_MONOTONIC), // start_time + GRPC_MILLIS_INF_FUTURE, // deadline + arena_, + context_, + &call_combiner_, + 0, // parent_data_size + }; + grpc_error* error = + health_check_client_->connected_subchannel_->CreateCall(args, &call_); + if (error != GRPC_ERROR_NONE) { + gpr_log(GPR_ERROR, + "HealthCheckClient %p CallState %p: error creating health " + "checking call on subchannel (%s); will retry", + health_check_client_.get(), this, grpc_error_string(error)); + GRPC_ERROR_UNREF(error); + // Schedule instead of running directly, since we must not be + // holding health_check_client_->mu_ when CallEnded() is called. + Ref(DEBUG_LOCATION, "call_end_closure").release(); + GRPC_CLOSURE_SCHED( + GRPC_CLOSURE_INIT(&batch_.handler_private.closure, CallEndedRetry, this, + grpc_schedule_on_exec_ctx), + GRPC_ERROR_NONE); + return; + } + // Initialize payload and batch. + memset(&batch_, 0, sizeof(batch_)); + batch_.payload = &payload_; + // on_complete callback takes ref, handled manually. + Ref(DEBUG_LOCATION, "on_complete").release(); + batch_.on_complete = GRPC_CLOSURE_INIT(&on_complete_, OnComplete, this, + grpc_schedule_on_exec_ctx); + // Add send_initial_metadata op. + grpc_metadata_batch_init(&send_initial_metadata_); + error = grpc_metadata_batch_add_head( + &send_initial_metadata_, &path_metadata_storage_, + grpc_mdelem_from_slices( + GRPC_MDSTR_PATH, + GRPC_MDSTR_SLASH_GRPC_DOT_HEALTH_DOT_V1_DOT_HEALTH_SLASH_WATCH)); + GPR_ASSERT(error == GRPC_ERROR_NONE); + payload_.send_initial_metadata.send_initial_metadata = + &send_initial_metadata_; + payload_.send_initial_metadata.send_initial_metadata_flags = 0; + payload_.send_initial_metadata.peer_string = nullptr; + batch_.send_initial_metadata = true; + // Add send_message op. + EncodeRequest(health_check_client_->service_name_, &send_message_); + payload_.send_message.send_message.reset(send_message_.get()); + batch_.send_message = true; + // Add send_trailing_metadata op. + grpc_metadata_batch_init(&send_trailing_metadata_); + payload_.send_trailing_metadata.send_trailing_metadata = + &send_trailing_metadata_; + batch_.send_trailing_metadata = true; + // Add recv_initial_metadata op. + grpc_metadata_batch_init(&recv_initial_metadata_); + payload_.recv_initial_metadata.recv_initial_metadata = + &recv_initial_metadata_; + payload_.recv_initial_metadata.recv_flags = nullptr; + payload_.recv_initial_metadata.trailing_metadata_available = nullptr; + payload_.recv_initial_metadata.peer_string = nullptr; + // recv_initial_metadata_ready callback takes ref, handled manually. + Ref(DEBUG_LOCATION, "recv_initial_metadata_ready").release(); + payload_.recv_initial_metadata.recv_initial_metadata_ready = + GRPC_CLOSURE_INIT(&recv_initial_metadata_ready_, RecvInitialMetadataReady, + this, grpc_schedule_on_exec_ctx); + batch_.recv_initial_metadata = true; + // Add recv_message op. + payload_.recv_message.recv_message = &recv_message_; + // recv_message callback takes ref, handled manually. + Ref(DEBUG_LOCATION, "recv_message_ready").release(); + payload_.recv_message.recv_message_ready = GRPC_CLOSURE_INIT( + &recv_message_ready_, RecvMessageReady, this, grpc_schedule_on_exec_ctx); + batch_.recv_message = true; + // Start batch. + StartBatch(&batch_); + // Initialize recv_trailing_metadata batch. + memset(&recv_trailing_metadata_batch_, 0, + sizeof(recv_trailing_metadata_batch_)); + recv_trailing_metadata_batch_.payload = &payload_; + // Add recv_trailing_metadata op. + grpc_metadata_batch_init(&recv_trailing_metadata_); + payload_.recv_trailing_metadata.recv_trailing_metadata = + &recv_trailing_metadata_; + payload_.recv_trailing_metadata.collect_stats = &collect_stats_; + // This callback signals the end of the call, so it relies on the + // initial ref instead of taking a new ref. When it's invoked, the + // initial ref is released. + payload_.recv_trailing_metadata.recv_trailing_metadata_ready = + GRPC_CLOSURE_INIT(&recv_trailing_metadata_ready_, + RecvTrailingMetadataReady, this, + grpc_schedule_on_exec_ctx); + recv_trailing_metadata_batch_.recv_trailing_metadata = true; + // Start recv_trailing_metadata batch. + StartBatch(&recv_trailing_metadata_batch_); +} + +void HealthCheckClient::CallState::StartBatchInCallCombiner(void* arg, + grpc_error* error) { + grpc_transport_stream_op_batch* batch = + static_cast<grpc_transport_stream_op_batch*>(arg); + grpc_subchannel_call* call = + static_cast<grpc_subchannel_call*>(batch->handler_private.extra_arg); + grpc_subchannel_call_process_op(call, batch); +} + +void HealthCheckClient::CallState::StartBatch( + grpc_transport_stream_op_batch* batch) { + batch->handler_private.extra_arg = call_; + GRPC_CLOSURE_INIT(&batch->handler_private.closure, StartBatchInCallCombiner, + batch, grpc_schedule_on_exec_ctx); + GRPC_CALL_COMBINER_START(&call_combiner_, &batch->handler_private.closure, + GRPC_ERROR_NONE, "start_subchannel_batch"); +} + +void HealthCheckClient::CallState::OnCancelComplete(void* arg, + grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + GRPC_CALL_COMBINER_STOP(&self->call_combiner_, "health_cancel"); + self->Unref(DEBUG_LOCATION, "cancel"); +} + +void HealthCheckClient::CallState::StartCancel(void* arg, grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + auto* batch = grpc_make_transport_stream_op( + GRPC_CLOSURE_CREATE(OnCancelComplete, self, grpc_schedule_on_exec_ctx)); + batch->cancel_stream = true; + batch->payload->cancel_stream.cancel_error = GRPC_ERROR_CANCELLED; + grpc_subchannel_call_process_op(self->call_, batch); +} + +void HealthCheckClient::CallState::Cancel() { + if (call_ != nullptr) { + Ref(DEBUG_LOCATION, "cancel").release(); + GRPC_CALL_COMBINER_START( + &call_combiner_, + GRPC_CLOSURE_CREATE(StartCancel, this, grpc_schedule_on_exec_ctx), + GRPC_ERROR_NONE, "health_cancel"); + } +} + +void HealthCheckClient::CallState::OnComplete(void* arg, grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + GRPC_CALL_COMBINER_STOP(&self->call_combiner_, "on_complete"); + grpc_metadata_batch_destroy(&self->send_initial_metadata_); + grpc_metadata_batch_destroy(&self->send_trailing_metadata_); + self->Unref(DEBUG_LOCATION, "on_complete"); +} + +void HealthCheckClient::CallState::RecvInitialMetadataReady(void* arg, + grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + GRPC_CALL_COMBINER_STOP(&self->call_combiner_, "recv_initial_metadata_ready"); + grpc_metadata_batch_destroy(&self->recv_initial_metadata_); + self->Unref(DEBUG_LOCATION, "recv_initial_metadata_ready"); +} + +void HealthCheckClient::CallState::DoneReadingRecvMessage(grpc_error* error) { + recv_message_.reset(); + if (error != GRPC_ERROR_NONE) { + GRPC_ERROR_UNREF(error); + Cancel(); + grpc_slice_buffer_destroy_internal(&recv_message_buffer_); + Unref(DEBUG_LOCATION, "recv_message_ready"); + return; + } + const bool healthy = DecodeResponse(&recv_message_buffer_, &error); + const grpc_connectivity_state state = + healthy ? GRPC_CHANNEL_READY : GRPC_CHANNEL_TRANSIENT_FAILURE; + if (error == GRPC_ERROR_NONE && !healthy) { + error = GRPC_ERROR_CREATE_FROM_STATIC_STRING("backend unhealthy"); + } + health_check_client_->SetHealthStatus(state, error); + gpr_atm_rel_store(&seen_response_, static_cast<gpr_atm>(1)); + grpc_slice_buffer_destroy_internal(&recv_message_buffer_); + // Start another recv_message batch. + // This re-uses the ref we're holding. + // Note: Can't just reuse batch_ here, since we don't know that all + // callbacks from the original batch have completed yet. + memset(&recv_message_batch_, 0, sizeof(recv_message_batch_)); + recv_message_batch_.payload = &payload_; + payload_.recv_message.recv_message = &recv_message_; + payload_.recv_message.recv_message_ready = GRPC_CLOSURE_INIT( + &recv_message_ready_, RecvMessageReady, this, grpc_schedule_on_exec_ctx); + recv_message_batch_.recv_message = true; + StartBatch(&recv_message_batch_); +} + +grpc_error* HealthCheckClient::CallState::PullSliceFromRecvMessage() { + grpc_slice slice; + grpc_error* error = recv_message_->Pull(&slice); + if (error == GRPC_ERROR_NONE) { + grpc_slice_buffer_add(&recv_message_buffer_, slice); + } + return error; +} + +void HealthCheckClient::CallState::ContinueReadingRecvMessage() { + while (recv_message_->Next(SIZE_MAX, &recv_message_ready_)) { + grpc_error* error = PullSliceFromRecvMessage(); + if (error != GRPC_ERROR_NONE) { + DoneReadingRecvMessage(error); + return; + } + if (recv_message_buffer_.length == recv_message_->length()) { + DoneReadingRecvMessage(GRPC_ERROR_NONE); + break; + } + } +} + +void HealthCheckClient::CallState::OnByteStreamNext(void* arg, + grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + if (error != GRPC_ERROR_NONE) { + self->DoneReadingRecvMessage(GRPC_ERROR_REF(error)); + return; + } + error = self->PullSliceFromRecvMessage(); + if (error != GRPC_ERROR_NONE) { + self->DoneReadingRecvMessage(error); + return; + } + if (self->recv_message_buffer_.length == self->recv_message_->length()) { + self->DoneReadingRecvMessage(GRPC_ERROR_NONE); + } else { + self->ContinueReadingRecvMessage(); + } +} + +void HealthCheckClient::CallState::RecvMessageReady(void* arg, + grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + GRPC_CALL_COMBINER_STOP(&self->call_combiner_, "recv_message_ready"); + if (self->recv_message_ == nullptr) { + self->Unref(DEBUG_LOCATION, "recv_message_ready"); + return; + } + grpc_slice_buffer_init(&self->recv_message_buffer_); + GRPC_CLOSURE_INIT(&self->recv_message_ready_, OnByteStreamNext, self, + grpc_schedule_on_exec_ctx); + self->ContinueReadingRecvMessage(); + // Ref will continue to be held until we finish draining the byte stream. +} + +void HealthCheckClient::CallState::RecvTrailingMetadataReady( + void* arg, grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + GRPC_CALL_COMBINER_STOP(&self->call_combiner_, + "recv_trailing_metadata_ready"); + // Get call status. + grpc_status_code status = GRPC_STATUS_UNKNOWN; + if (error != GRPC_ERROR_NONE) { + grpc_error_get_status(error, GRPC_MILLIS_INF_FUTURE, &status, + nullptr /* slice */, nullptr /* http_error */, + nullptr /* error_string */); + } else if (self->recv_trailing_metadata_.idx.named.grpc_status != nullptr) { + status = grpc_get_status_code_from_metadata( + self->recv_trailing_metadata_.idx.named.grpc_status->md); + } + if (grpc_health_check_client_trace.enabled()) { + gpr_log(GPR_INFO, + "HealthCheckClient %p CallState %p: health watch failed with " + "status %d", + self->health_check_client_.get(), self, status); + } + // Clean up. + grpc_metadata_batch_destroy(&self->recv_trailing_metadata_); + // For status UNIMPLEMENTED, give up and assume always healthy. + bool retry = true; + if (status == GRPC_STATUS_UNIMPLEMENTED) { + static const char kErrorMessage[] = + "health checking Watch method returned UNIMPLEMENTED; " + "disabling health checks but assuming server is healthy"; + gpr_log(GPR_ERROR, kErrorMessage); + if (self->health_check_client_->channelz_node_ != nullptr) { + self->health_check_client_->channelz_node_->AddTraceEvent( + channelz::ChannelTrace::Error, + grpc_slice_from_static_string(kErrorMessage)); + } + self->health_check_client_->SetHealthStatus(GRPC_CHANNEL_READY, + GRPC_ERROR_NONE); + retry = false; + } + self->CallEnded(retry); +} + +void HealthCheckClient::CallState::CallEndedRetry(void* arg, + grpc_error* error) { + HealthCheckClient::CallState* self = + static_cast<HealthCheckClient::CallState*>(arg); + self->CallEnded(true /* retry */); + self->Unref(DEBUG_LOCATION, "call_end_closure"); +} + +void HealthCheckClient::CallState::CallEnded(bool retry) { + // If this CallState is still in use, this call ended because of a failure, + // so we need to stop using it and optionally create a new one. + // Otherwise, we have deliberately ended this call, and no further action + // is required. + if (this == health_check_client_->call_state_.get()) { + health_check_client_->call_state_.reset(); + if (retry) { + GPR_ASSERT(!health_check_client_->shutting_down_); + if (static_cast<bool>(gpr_atm_acq_load(&seen_response_))) { + // If the call fails after we've gotten a successful response, reset + // the backoff and restart the call immediately. + health_check_client_->retry_backoff_.Reset(); + health_check_client_->StartCall(); + } else { + // If the call failed without receiving any messages, retry later. + health_check_client_->StartRetryTimer(); + } + } + } + Unref(DEBUG_LOCATION, "call_ended"); +} + +} // namespace grpc_core diff --git a/src/core/ext/filters/client_channel/health/health_check_client.h b/src/core/ext/filters/client_channel/health/health_check_client.h new file mode 100644 index 0000000000..7f77348f18 --- /dev/null +++ b/src/core/ext/filters/client_channel/health/health_check_client.h @@ -0,0 +1,173 @@ +/* + * + * Copyright 2018 gRPC authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef GRPC_CORE_EXT_FILTERS_CLIENT_CHANNEL_HEALTH_HEALTH_CHECK_CLIENT_H +#define GRPC_CORE_EXT_FILTERS_CLIENT_CHANNEL_HEALTH_HEALTH_CHECK_CLIENT_H + +#include <grpc/support/port_platform.h> + +#include <grpc/grpc.h> +#include <grpc/support/atm.h> +#include <grpc/support/sync.h> + +#include "src/core/ext/filters/client_channel/client_channel_channelz.h" +#include "src/core/ext/filters/client_channel/subchannel.h" +#include "src/core/lib/backoff/backoff.h" +#include "src/core/lib/gpr/arena.h" +#include "src/core/lib/gprpp/orphanable.h" +#include "src/core/lib/gprpp/ref_counted_ptr.h" +#include "src/core/lib/iomgr/call_combiner.h" +#include "src/core/lib/iomgr/closure.h" +#include "src/core/lib/iomgr/polling_entity.h" +#include "src/core/lib/iomgr/timer.h" +#include "src/core/lib/transport/byte_stream.h" +#include "src/core/lib/transport/metadata_batch.h" +#include "src/core/lib/transport/transport.h" + +namespace grpc_core { + +class HealthCheckClient + : public InternallyRefCountedWithTracing<HealthCheckClient> { + public: + HealthCheckClient(const char* service_name, + RefCountedPtr<ConnectedSubchannel> connected_subchannel, + grpc_pollset_set* interested_parties, + RefCountedPtr<channelz::SubchannelNode> channelz_node); + + ~HealthCheckClient(); + + // When the health state changes from *state, sets *state to the new + // value and schedules closure. + // Only one closure can be outstanding at a time. + void NotifyOnHealthChange(grpc_connectivity_state* state, + grpc_closure* closure); + + void Orphan() override; + + private: + // Contains a call to the backend and all the data related to the call. + class CallState : public InternallyRefCountedWithTracing<CallState> { + public: + CallState(RefCountedPtr<HealthCheckClient> health_check_client, + grpc_pollset_set* interested_parties_); + ~CallState(); + + void Orphan() override; + + void StartCall(); + + private: + void Cancel(); + + void StartBatch(grpc_transport_stream_op_batch* batch); + static void StartBatchInCallCombiner(void* arg, grpc_error* error); + + static void CallEndedRetry(void* arg, grpc_error* error); + void CallEnded(bool retry); + + static void OnComplete(void* arg, grpc_error* error); + static void RecvInitialMetadataReady(void* arg, grpc_error* error); + static void RecvMessageReady(void* arg, grpc_error* error); + static void RecvTrailingMetadataReady(void* arg, grpc_error* error); + static void StartCancel(void* arg, grpc_error* error); + static void OnCancelComplete(void* arg, grpc_error* error); + + static void OnByteStreamNext(void* arg, grpc_error* error); + void ContinueReadingRecvMessage(); + grpc_error* PullSliceFromRecvMessage(); + void DoneReadingRecvMessage(grpc_error* error); + + RefCountedPtr<HealthCheckClient> health_check_client_; + grpc_polling_entity pollent_; + + gpr_arena* arena_; + grpc_call_combiner call_combiner_; + grpc_call_context_element context_[GRPC_CONTEXT_COUNT]; + + // The streaming call to the backend. Always non-NULL. + grpc_subchannel_call* call_; + + grpc_transport_stream_op_batch_payload payload_; + grpc_transport_stream_op_batch batch_; + grpc_transport_stream_op_batch recv_message_batch_; + grpc_transport_stream_op_batch recv_trailing_metadata_batch_; + + grpc_closure on_complete_; + + // send_initial_metadata + grpc_metadata_batch send_initial_metadata_; + grpc_linked_mdelem path_metadata_storage_; + + // send_message + ManualConstructor<SliceBufferByteStream> send_message_; + + // send_trailing_metadata + grpc_metadata_batch send_trailing_metadata_; + + // recv_initial_metadata + grpc_metadata_batch recv_initial_metadata_; + grpc_closure recv_initial_metadata_ready_; + + // recv_message + OrphanablePtr<ByteStream> recv_message_; + grpc_closure recv_message_ready_; + grpc_slice_buffer recv_message_buffer_; + gpr_atm seen_response_; + + // recv_trailing_metadata + grpc_metadata_batch recv_trailing_metadata_; + grpc_transport_stream_stats collect_stats_; + grpc_closure recv_trailing_metadata_ready_; + }; + + void StartCall(); + void StartCallLocked(); // Requires holding mu_. + + void StartRetryTimer(); + static void OnRetryTimer(void* arg, grpc_error* error); + + void SetHealthStatus(grpc_connectivity_state state, grpc_error* error); + void SetHealthStatusLocked(grpc_connectivity_state state, + grpc_error* error); // Requires holding mu_. + + const char* service_name_; // Do not own. + RefCountedPtr<ConnectedSubchannel> connected_subchannel_; + grpc_pollset_set* interested_parties_; // Do not own. + RefCountedPtr<channelz::SubchannelNode> channelz_node_; + + gpr_mu mu_; + grpc_connectivity_state state_ = GRPC_CHANNEL_CONNECTING; + grpc_error* error_ = GRPC_ERROR_NONE; + grpc_connectivity_state* notify_state_ = nullptr; + grpc_closure* on_health_changed_ = nullptr; + bool shutting_down_ = false; + + // The data associated with the current health check call. It holds a ref + // to this HealthCheckClient object. + OrphanablePtr<CallState> call_state_; + + // Call retry state. + BackOff retry_backoff_; + grpc_timer retry_timer_; + grpc_closure retry_timer_callback_; + bool retry_timer_callback_pending_ = false; +}; + +} // namespace grpc_core + +#endif /* GRPC_CORE_EXT_FILTERS_CLIENT_CHANNEL_HEALTH_HEALTH_CHECK_CLIENT_H */ diff --git a/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc b/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc index 5511df7a27..17e0d26875 100644 --- a/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc +++ b/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc @@ -1699,7 +1699,7 @@ grpc_channel_args* GrpcLb::CreateRoundRobinPolicyArgsLocked() { // Replace the LB addresses in the channel args that we pass down to // the subchannel. static const char* keys_to_remove[] = {GRPC_ARG_LB_ADDRESSES}; - const grpc_arg args_to_add[] = { + grpc_arg args_to_add[3] = { grpc_lb_addresses_create_channel_arg(addresses), // A channel arg indicating if the target is a backend inferred from a // grpclb load balancer. @@ -1708,9 +1708,15 @@ grpc_channel_args* GrpcLb::CreateRoundRobinPolicyArgsLocked() { GRPC_ARG_ADDRESS_IS_BACKEND_FROM_GRPCLB_LOAD_BALANCER), is_backend_from_grpclb_load_balancer), }; + size_t num_args_to_add = 2; + if (is_backend_from_grpclb_load_balancer) { + args_to_add[2] = grpc_channel_arg_integer_create( + const_cast<char*>(GRPC_ARG_INHIBIT_HEALTH_CHECKING), 1); + ++num_args_to_add; + } grpc_channel_args* args = grpc_channel_args_copy_and_add_and_remove( args_, keys_to_remove, GPR_ARRAY_SIZE(keys_to_remove), args_to_add, - GPR_ARRAY_SIZE(args_to_add)); + num_args_to_add); grpc_lb_addresses_destroy(addresses); return args; } diff --git a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc index f4dca146f7..eb494486b9 100644 --- a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc +++ b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc @@ -359,9 +359,14 @@ void PickFirst::UpdateLocked(const grpc_channel_args& args) { "Pick First %p received update with %" PRIuPTR " addresses", this, addresses->num_addresses); } + grpc_arg new_arg = grpc_channel_arg_integer_create( + const_cast<char*>(GRPC_ARG_INHIBIT_HEALTH_CHECKING), 1); + grpc_channel_args* new_args = + grpc_channel_args_copy_and_add(&args, &new_arg, 1); auto subchannel_list = MakeOrphanable<PickFirstSubchannelList>( this, &grpc_lb_pick_first_trace, addresses, combiner(), - client_channel_factory(), args); + client_channel_factory(), *new_args); + grpc_channel_args_destroy(new_args); if (subchannel_list->num_subchannels() == 0) { // Empty update or no valid subchannels. Unsubscribe from all current // subchannels and put the channel in TRANSIENT_FAILURE. diff --git a/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h b/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h index e0e0e1e638..4ec9e935ed 100644 --- a/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h +++ b/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h @@ -102,8 +102,8 @@ class SubchannelData { // ProcessConnectivityChangeLocked()). grpc_connectivity_state CheckConnectivityStateLocked(grpc_error** error) { GPR_ASSERT(!connectivity_notification_pending_); - pending_connectivity_state_unsafe_ = - grpc_subchannel_check_connectivity(subchannel(), error); + pending_connectivity_state_unsafe_ = grpc_subchannel_check_connectivity( + subchannel(), error, subchannel_list_->inhibit_health_checking()); UpdateConnectedSubchannelLocked(); return pending_connectivity_state_unsafe_; } @@ -216,6 +216,7 @@ class SubchannelList // Accessors. LoadBalancingPolicy* policy() const { return policy_; } TraceFlag* tracer() const { return tracer_; } + bool inhibit_health_checking() const { return inhibit_health_checking_; } // Resets connection backoff of all subchannels. // TODO(roth): We will probably need to rethink this as part of moving @@ -254,6 +255,8 @@ class SubchannelList TraceFlag* tracer_; + bool inhibit_health_checking_; + grpc_combiner* combiner_; // The list of subchannels. @@ -340,7 +343,8 @@ void SubchannelData<SubchannelListType, subchannel_list()->Ref(DEBUG_LOCATION, "connectivity_watch").release(); grpc_subchannel_notify_on_state_change( subchannel_, subchannel_list_->policy()->interested_parties(), - &pending_connectivity_state_unsafe_, &connectivity_changed_closure_); + &pending_connectivity_state_unsafe_, &connectivity_changed_closure_, + subchannel_list_->inhibit_health_checking()); } template <typename SubchannelListType, typename SubchannelDataType> @@ -359,7 +363,8 @@ void SubchannelData<SubchannelListType, GPR_ASSERT(connectivity_notification_pending_); grpc_subchannel_notify_on_state_change( subchannel_, subchannel_list_->policy()->interested_parties(), - &pending_connectivity_state_unsafe_, &connectivity_changed_closure_); + &pending_connectivity_state_unsafe_, &connectivity_changed_closure_, + subchannel_list_->inhibit_health_checking()); } template <typename SubchannelListType, typename SubchannelDataType> @@ -390,8 +395,9 @@ void SubchannelData<SubchannelListType, SubchannelDataType>:: subchannel_, reason); } GPR_ASSERT(connectivity_notification_pending_); - grpc_subchannel_notify_on_state_change(subchannel_, nullptr, nullptr, - &connectivity_changed_closure_); + grpc_subchannel_notify_on_state_change( + subchannel_, nullptr, nullptr, &connectivity_changed_closure_, + subchannel_list_->inhibit_health_checking()); } template <typename SubchannelListType, typename SubchannelDataType> @@ -499,8 +505,13 @@ SubchannelList<SubchannelListType, SubchannelDataType>::SubchannelList( subchannels_.reserve(addresses->num_addresses); // We need to remove the LB addresses in order to be able to compare the // subchannel keys of subchannels from a different batch of addresses. + // We also remove the inhibit-health-checking arg, since we are + // handling that here. + inhibit_health_checking_ = grpc_channel_arg_get_bool( + grpc_channel_args_find(&args, GRPC_ARG_INHIBIT_HEALTH_CHECKING), false); static const char* keys_to_remove[] = {GRPC_ARG_SUBCHANNEL_ADDRESS, - GRPC_ARG_LB_ADDRESSES}; + GRPC_ARG_LB_ADDRESSES, + GRPC_ARG_INHIBIT_HEALTH_CHECKING}; // Create a subchannel for each address. grpc_subchannel_args sc_args; for (size_t i = 0; i < addresses->num_addresses; i++) { diff --git a/src/core/ext/filters/client_channel/subchannel.cc b/src/core/ext/filters/client_channel/subchannel.cc index a0a40eb2b3..e4c6efe862 100644 --- a/src/core/ext/filters/client_channel/subchannel.cc +++ b/src/core/ext/filters/client_channel/subchannel.cc @@ -30,6 +30,7 @@ #include <grpc/support/string_util.h> #include "src/core/ext/filters/client_channel/client_channel.h" +#include "src/core/ext/filters/client_channel/health/health_check_client.h" #include "src/core/ext/filters/client_channel/parse_address.h" #include "src/core/ext/filters/client_channel/proxy_mapper_registry.h" #include "src/core/ext/filters/client_channel/subchannel_index.h" @@ -41,6 +42,7 @@ #include "src/core/lib/gpr/alloc.h" #include "src/core/lib/gprpp/debug_location.h" #include "src/core/lib/gprpp/manual_constructor.h" +#include "src/core/lib/gprpp/mutex_lock.h" #include "src/core/lib/gprpp/ref_counted_ptr.h" #include "src/core/lib/iomgr/sockaddr_utils.h" #include "src/core/lib/iomgr/timer.h" @@ -50,6 +52,7 @@ #include "src/core/lib/surface/channel_init.h" #include "src/core/lib/transport/connectivity_state.h" #include "src/core/lib/transport/error_utils.h" +#include "src/core/lib/transport/service_config.h" #include "src/core/lib/transport/status_metadata.h" #define INTERNAL_REF_BITS 16 @@ -66,6 +69,10 @@ struct state_watcher { grpc_closure closure; grpc_subchannel* subchannel; grpc_connectivity_state connectivity_state; + grpc_connectivity_state last_connectivity_state; + grpc_core::OrphanablePtr<grpc_core::HealthCheckClient> health_check_client; + grpc_closure health_check_closure; + grpc_connectivity_state health_state; }; } // namespace @@ -78,6 +85,12 @@ typedef struct external_state_watcher { struct external_state_watcher* prev; } external_state_watcher; +namespace grpc_core { + +class ConnectedSubchannelStateWatcher; + +} // namespace grpc_core + struct grpc_subchannel { grpc_connector* connector; @@ -109,19 +122,24 @@ struct grpc_subchannel { being setup */ grpc_pollset_set* pollset_set; + grpc_core::UniquePtr<char> health_check_service_name; + /** mutex protecting remaining elements */ gpr_mu mu; - /** active connection, or null; of type grpc_core::ConnectedSubchannel - */ + /** active connection, or null */ grpc_core::RefCountedPtr<grpc_core::ConnectedSubchannel> connected_subchannel; + grpc_core::OrphanablePtr<grpc_core::ConnectedSubchannelStateWatcher> + connected_subchannel_watcher; /** have we seen a disconnection? */ bool disconnected; /** are we connecting */ bool connecting; + /** connectivity state tracking */ grpc_connectivity_state_tracker state_tracker; + grpc_connectivity_state_tracker state_and_health_tracker; external_state_watcher root_external_state_watcher; @@ -153,6 +171,171 @@ struct grpc_subchannel_call { grpc_millis deadline; }; +static void maybe_start_connecting_locked(grpc_subchannel* c); + +static const char* subchannel_connectivity_state_change_string( + grpc_connectivity_state state) { + switch (state) { + case GRPC_CHANNEL_IDLE: + return "Subchannel state change to IDLE"; + case GRPC_CHANNEL_CONNECTING: + return "Subchannel state change to CONNECTING"; + case GRPC_CHANNEL_READY: + return "Subchannel state change to READY"; + case GRPC_CHANNEL_TRANSIENT_FAILURE: + return "Subchannel state change to TRANSIENT_FAILURE"; + case GRPC_CHANNEL_SHUTDOWN: + return "Subchannel state change to SHUTDOWN"; + } + GPR_UNREACHABLE_CODE(return "UNKNOWN"); +} + +static void set_subchannel_connectivity_state_locked( + grpc_subchannel* c, grpc_connectivity_state state, grpc_error* error, + const char* reason) { + if (c->channelz_subchannel != nullptr) { + c->channelz_subchannel->AddTraceEvent( + grpc_core::channelz::ChannelTrace::Severity::Info, + grpc_slice_from_static_string( + subchannel_connectivity_state_change_string(state))); + } + grpc_connectivity_state_set(&c->state_tracker, state, error, reason); +} + +namespace grpc_core { + +class ConnectedSubchannelStateWatcher + : public InternallyRefCounted<ConnectedSubchannelStateWatcher> { + public: + // Must be instantiated while holding c->mu. + explicit ConnectedSubchannelStateWatcher(grpc_subchannel* c) + : subchannel_(c) { + // Steal subchannel ref for connecting. + GRPC_SUBCHANNEL_WEAK_REF(subchannel_, "state_watcher"); + GRPC_SUBCHANNEL_WEAK_UNREF(subchannel_, "connecting"); + // Start watching for connectivity state changes. + // Callback uses initial ref to this. + GRPC_CLOSURE_INIT(&on_connectivity_changed_, OnConnectivityChanged, this, + grpc_schedule_on_exec_ctx); + c->connected_subchannel->NotifyOnStateChange(c->pollset_set, + &pending_connectivity_state_, + &on_connectivity_changed_); + // Start health check if needed. + grpc_connectivity_state health_state = GRPC_CHANNEL_READY; + if (c->health_check_service_name != nullptr) { + health_check_client_ = grpc_core::MakeOrphanable<HealthCheckClient>( + c->health_check_service_name.get(), c->connected_subchannel, + c->pollset_set, c->channelz_subchannel); + GRPC_CLOSURE_INIT(&on_health_changed_, OnHealthChanged, this, + grpc_schedule_on_exec_ctx); + Ref().release(); // Ref for health callback tracked manually. + health_check_client_->NotifyOnHealthChange(&health_state_, + &on_health_changed_); + health_state = GRPC_CHANNEL_CONNECTING; + } + // Report initial state. + set_subchannel_connectivity_state_locked( + c, GRPC_CHANNEL_READY, GRPC_ERROR_NONE, "subchannel_connected"); + grpc_connectivity_state_set(&c->state_and_health_tracker, health_state, + GRPC_ERROR_NONE, "subchannel_connected"); + } + + ~ConnectedSubchannelStateWatcher() { + GRPC_SUBCHANNEL_WEAK_UNREF(subchannel_, "state_watcher"); + } + + void Orphan() override { health_check_client_.reset(); } + + private: + static void OnConnectivityChanged(void* arg, grpc_error* error) { + auto* self = static_cast<ConnectedSubchannelStateWatcher*>(arg); + grpc_subchannel* c = self->subchannel_; + { + MutexLock lock(&c->mu); + switch (self->pending_connectivity_state_) { + case GRPC_CHANNEL_TRANSIENT_FAILURE: + case GRPC_CHANNEL_SHUTDOWN: { + if (!c->disconnected && c->connected_subchannel != nullptr) { + if (grpc_trace_stream_refcount.enabled()) { + gpr_log(GPR_INFO, + "Connected subchannel %p of subchannel %p has gone into " + "%s. Attempting to reconnect.", + c->connected_subchannel.get(), c, + grpc_connectivity_state_name( + self->pending_connectivity_state_)); + } + c->connected_subchannel.reset(); + c->connected_subchannel_watcher.reset(); + self->last_connectivity_state_ = GRPC_CHANNEL_TRANSIENT_FAILURE; + set_subchannel_connectivity_state_locked( + c, GRPC_CHANNEL_TRANSIENT_FAILURE, GRPC_ERROR_REF(error), + "reflect_child"); + grpc_connectivity_state_set(&c->state_and_health_tracker, + GRPC_CHANNEL_TRANSIENT_FAILURE, + GRPC_ERROR_REF(error), "reflect_child"); + c->backoff_begun = false; + c->backoff->Reset(); + maybe_start_connecting_locked(c); + } else { + self->last_connectivity_state_ = GRPC_CHANNEL_SHUTDOWN; + } + self->health_check_client_.reset(); + break; + } + default: { + // In principle, this should never happen. We should not get + // a callback for READY, because that was the state we started + // this watch from. And a connected subchannel should never go + // from READY to CONNECTING or IDLE. + self->last_connectivity_state_ = self->pending_connectivity_state_; + set_subchannel_connectivity_state_locked( + c, self->pending_connectivity_state_, GRPC_ERROR_REF(error), + "reflect_child"); + if (self->pending_connectivity_state_ != GRPC_CHANNEL_READY) { + grpc_connectivity_state_set(&c->state_and_health_tracker, + self->pending_connectivity_state_, + GRPC_ERROR_REF(error), "reflect_child"); + } + c->connected_subchannel->NotifyOnStateChange( + nullptr, &self->pending_connectivity_state_, + &self->on_connectivity_changed_); + self = nullptr; // So we don't unref below. + } + } + } + // Don't unref until we've released the lock, because this might + // cause the subchannel (which contains the lock) to be destroyed. + if (self != nullptr) self->Unref(); + } + + static void OnHealthChanged(void* arg, grpc_error* error) { + auto* self = static_cast<ConnectedSubchannelStateWatcher*>(arg); + if (self->health_state_ == GRPC_CHANNEL_SHUTDOWN) { + self->Unref(); + return; + } + grpc_subchannel* c = self->subchannel_; + MutexLock lock(&c->mu); + if (self->last_connectivity_state_ == GRPC_CHANNEL_READY) { + grpc_connectivity_state_set(&c->state_and_health_tracker, + self->health_state_, GRPC_ERROR_REF(error), + "health_changed"); + } + self->health_check_client_->NotifyOnHealthChange(&self->health_state_, + &self->on_health_changed_); + } + + grpc_subchannel* subchannel_; + grpc_closure on_connectivity_changed_; + grpc_connectivity_state pending_connectivity_state_ = GRPC_CHANNEL_READY; + grpc_connectivity_state last_connectivity_state_ = GRPC_CHANNEL_READY; + grpc_core::OrphanablePtr<grpc_core::HealthCheckClient> health_check_client_; + grpc_closure on_health_changed_; + grpc_connectivity_state health_state_ = GRPC_CHANNEL_CONNECTING; +}; + +} // namespace grpc_core + #define SUBCHANNEL_CALL_TO_CALL_STACK(call) \ (grpc_call_stack*)((char*)(call) + GPR_ROUND_UP_TO_ALIGNMENT_SIZE( \ sizeof(grpc_subchannel_call))) @@ -198,8 +381,10 @@ static void subchannel_destroy(void* arg, grpc_error* error) { c->channelz_subchannel.reset(); } gpr_free((void*)c->filters); + c->health_check_service_name.reset(); grpc_channel_args_destroy(c->args); grpc_connectivity_state_destroy(&c->state_tracker); + grpc_connectivity_state_destroy(&c->state_and_health_tracker); grpc_connector_unref(c->connector); grpc_pollset_set_destroy(c->pollset_set); grpc_subchannel_key_destroy(c->key); @@ -262,6 +447,7 @@ static void disconnect(grpc_subchannel* c) { grpc_connector_shutdown(c->connector, GRPC_ERROR_CREATE_FROM_STATIC_STRING( "Subchannel disconnected")); c->connected_subchannel.reset(); + c->connected_subchannel_watcher.reset(); gpr_mu_unlock(&c->mu); } @@ -337,6 +523,31 @@ static void parse_args_for_backoff_values( .set_max_backoff(max_backoff_ms); } +namespace grpc_core { +namespace { + +struct HealthCheckParams { + UniquePtr<char> service_name; + + static void Parse(const grpc_json* field, HealthCheckParams* params) { + if (strcmp(field->key, "healthCheckConfig") == 0) { + if (field->type != GRPC_JSON_OBJECT) return; + for (grpc_json* sub_field = field->child; sub_field != nullptr; + sub_field = sub_field->next) { + if (sub_field->key == nullptr) return; + if (strcmp(sub_field->key, "serviceName") == 0) { + if (params->service_name != nullptr) return; // Duplicate. + if (sub_field->type != GRPC_JSON_STRING) return; + params->service_name.reset(gpr_strdup(sub_field->value)); + } + } + } + } +}; + +} // namespace +} // namespace grpc_core + grpc_subchannel* grpc_subchannel_create(grpc_connector* connector, const grpc_subchannel_args* args) { grpc_subchannel_key* key = grpc_subchannel_key_create(args); @@ -387,12 +598,28 @@ grpc_subchannel* grpc_subchannel_create(grpc_connector* connector, grpc_schedule_on_exec_ctx); grpc_connectivity_state_init(&c->state_tracker, GRPC_CHANNEL_IDLE, "subchannel"); + grpc_connectivity_state_init(&c->state_and_health_tracker, GRPC_CHANNEL_IDLE, + "subchannel"); grpc_core::BackOff::Options backoff_options; parse_args_for_backoff_values(args->args, &backoff_options, &c->min_connect_timeout_ms); c->backoff.Init(backoff_options); gpr_mu_init(&c->mu); + // Check whether we should enable health checking. + const char* service_config_json = grpc_channel_arg_get_string( + grpc_channel_args_find(c->args, GRPC_ARG_SERVICE_CONFIG)); + if (service_config_json != nullptr) { + grpc_core::UniquePtr<grpc_core::ServiceConfig> service_config = + grpc_core::ServiceConfig::Create(service_config_json); + if (service_config != nullptr) { + grpc_core::HealthCheckParams params; + service_config->ParseGlobalParams(grpc_core::HealthCheckParams::Parse, + ¶ms); + c->health_check_service_name = std::move(params.service_name); + } + } + const grpc_arg* arg = grpc_channel_args_find(c->args, GRPC_ARG_ENABLE_CHANNELZ); bool channelz_enabled = @@ -428,35 +655,6 @@ intptr_t grpc_subchannel_get_child_socket_uuid(grpc_subchannel* subchannel) { } } -static const char* subchannel_connectivity_state_change_string( - grpc_connectivity_state state) { - switch (state) { - case GRPC_CHANNEL_IDLE: - return "Subchannel state change to IDLE"; - case GRPC_CHANNEL_CONNECTING: - return "Subchannel state change to CONNECTING"; - case GRPC_CHANNEL_READY: - return "Subchannel state change to READY"; - case GRPC_CHANNEL_TRANSIENT_FAILURE: - return "Subchannel state change to TRANSIENT_FAILURE"; - case GRPC_CHANNEL_SHUTDOWN: - return "Subchannel state change to SHUTDOWN"; - } - GPR_UNREACHABLE_CODE(return "UNKNOWN"); -} - -static void set_subchannel_connectivity_state_locked( - grpc_subchannel* c, grpc_connectivity_state state, grpc_error* error, - const char* reason) { - if (c->channelz_subchannel != nullptr) { - c->channelz_subchannel->AddTraceEvent( - grpc_core::channelz::ChannelTrace::Severity::Info, - grpc_slice_from_static_string( - subchannel_connectivity_state_change_string(state))); - } - grpc_connectivity_state_set(&c->state_tracker, state, error, reason); -} - static void continue_connect_locked(grpc_subchannel* c) { grpc_connect_in_args args; args.interested_parties = c->pollset_set; @@ -467,15 +665,19 @@ static void continue_connect_locked(grpc_subchannel* c) { args.channel_args = c->args; set_subchannel_connectivity_state_locked(c, GRPC_CHANNEL_CONNECTING, GRPC_ERROR_NONE, "connecting"); + grpc_connectivity_state_set(&c->state_and_health_tracker, + GRPC_CHANNEL_CONNECTING, GRPC_ERROR_NONE, + "connecting"); grpc_connector_connect(c->connector, &args, &c->connecting_result, &c->on_connected); } -grpc_connectivity_state grpc_subchannel_check_connectivity(grpc_subchannel* c, - grpc_error** error) { - grpc_connectivity_state state; +grpc_connectivity_state grpc_subchannel_check_connectivity( + grpc_subchannel* c, grpc_error** error, bool inhibit_health_checks) { gpr_mu_lock(&c->mu); - state = grpc_connectivity_state_get(&c->state_tracker, error); + grpc_connectivity_state_tracker* tracker = + inhibit_health_checks ? &c->state_tracker : &c->state_and_health_tracker; + grpc_connectivity_state state = grpc_connectivity_state_get(tracker, error); gpr_mu_unlock(&c->mu); return state; } @@ -533,7 +735,8 @@ static void maybe_start_connecting_locked(grpc_subchannel* c) { /* Already connected: don't restart */ return; } - if (!grpc_connectivity_state_has_watchers(&c->state_tracker)) { + if (!grpc_connectivity_state_has_watchers(&c->state_tracker) && + !grpc_connectivity_state_has_watchers(&c->state_and_health_tracker)) { /* Nobody is interested in connecting: so don't just yet */ return; } @@ -560,16 +763,18 @@ static void maybe_start_connecting_locked(grpc_subchannel* c) { void grpc_subchannel_notify_on_state_change( grpc_subchannel* c, grpc_pollset_set* interested_parties, - grpc_connectivity_state* state, grpc_closure* notify) { + grpc_connectivity_state* state, grpc_closure* notify, + bool inhibit_health_checks) { + grpc_connectivity_state_tracker* tracker = + inhibit_health_checks ? &c->state_tracker : &c->state_and_health_tracker; external_state_watcher* w; - if (state == nullptr) { gpr_mu_lock(&c->mu); for (w = c->root_external_state_watcher.next; w != &c->root_external_state_watcher; w = w->next) { if (w->notify == notify) { - grpc_connectivity_state_notify_on_state_change(&c->state_tracker, - nullptr, &w->closure); + grpc_connectivity_state_notify_on_state_change(tracker, nullptr, + &w->closure); } } gpr_mu_unlock(&c->mu); @@ -588,62 +793,12 @@ void grpc_subchannel_notify_on_state_change( w->next = &c->root_external_state_watcher; w->prev = w->next->prev; w->next->prev = w->prev->next = w; - grpc_connectivity_state_notify_on_state_change(&c->state_tracker, state, - &w->closure); + grpc_connectivity_state_notify_on_state_change(tracker, state, &w->closure); maybe_start_connecting_locked(c); gpr_mu_unlock(&c->mu); } } -static void on_connected_subchannel_connectivity_changed(void* p, - grpc_error* error) { - state_watcher* connected_subchannel_watcher = static_cast<state_watcher*>(p); - grpc_subchannel* c = connected_subchannel_watcher->subchannel; - gpr_mu* mu = &c->mu; - - gpr_mu_lock(mu); - - switch (connected_subchannel_watcher->connectivity_state) { - case GRPC_CHANNEL_TRANSIENT_FAILURE: - case GRPC_CHANNEL_SHUTDOWN: { - if (!c->disconnected && c->connected_subchannel != nullptr) { - if (grpc_trace_stream_refcount.enabled()) { - gpr_log(GPR_INFO, - "Connected subchannel %p of subchannel %p has gone into %s. " - "Attempting to reconnect.", - c->connected_subchannel.get(), c, - grpc_connectivity_state_name( - connected_subchannel_watcher->connectivity_state)); - } - c->connected_subchannel.reset(); - set_subchannel_connectivity_state_locked( - c, GRPC_CHANNEL_TRANSIENT_FAILURE, GRPC_ERROR_REF(error), - "reflect_child"); - c->backoff_begun = false; - c->backoff->Reset(); - maybe_start_connecting_locked(c); - } else { - connected_subchannel_watcher->connectivity_state = - GRPC_CHANNEL_SHUTDOWN; - } - break; - } - default: { - set_subchannel_connectivity_state_locked( - c, connected_subchannel_watcher->connectivity_state, - GRPC_ERROR_REF(error), "reflect_child"); - GRPC_SUBCHANNEL_WEAK_REF(c, "state_watcher"); - c->connected_subchannel->NotifyOnStateChange( - nullptr, &connected_subchannel_watcher->connectivity_state, - &connected_subchannel_watcher->closure); - connected_subchannel_watcher = nullptr; - } - } - gpr_mu_unlock(mu); - GRPC_SUBCHANNEL_WEAK_UNREF(c, "state_watcher"); - gpr_free(connected_subchannel_watcher); -} - static bool publish_transport_locked(grpc_subchannel* c) { /* construct channel stack */ grpc_channel_stack_builder* builder = grpc_channel_stack_builder_create(); @@ -670,17 +825,7 @@ static bool publish_transport_locked(grpc_subchannel* c) { intptr_t socket_uuid = c->connecting_result.socket_uuid; memset(&c->connecting_result, 0, sizeof(c->connecting_result)); - /* initialize state watcher */ - state_watcher* connected_subchannel_watcher = static_cast<state_watcher*>( - gpr_zalloc(sizeof(*connected_subchannel_watcher))); - connected_subchannel_watcher->subchannel = c; - connected_subchannel_watcher->connectivity_state = GRPC_CHANNEL_READY; - GRPC_CLOSURE_INIT(&connected_subchannel_watcher->closure, - on_connected_subchannel_connectivity_changed, - connected_subchannel_watcher, grpc_schedule_on_exec_ctx); - if (c->disconnected) { - gpr_free(connected_subchannel_watcher); grpc_channel_stack_destroy(stk); gpr_free(stk); return false; @@ -692,17 +837,10 @@ static bool publish_transport_locked(grpc_subchannel* c) { gpr_log(GPR_INFO, "New connected subchannel at %p for subchannel %p", c->connected_subchannel.get(), c); - /* setup subchannel watching connected subchannel for changes; subchannel - ref for connecting is donated to the state watcher */ - GRPC_SUBCHANNEL_WEAK_REF(c, "state_watcher"); - GRPC_SUBCHANNEL_WEAK_UNREF(c, "connecting"); - c->connected_subchannel->NotifyOnStateChange( - c->pollset_set, &connected_subchannel_watcher->connectivity_state, - &connected_subchannel_watcher->closure); - - /* signal completion */ - set_subchannel_connectivity_state_locked(c, GRPC_CHANNEL_READY, - GRPC_ERROR_NONE, "connected"); + // Instantiate state watcher. Will clean itself up. + c->connected_subchannel_watcher = + grpc_core::MakeOrphanable<grpc_core::ConnectedSubchannelStateWatcher>(c); + return true; } @@ -725,6 +863,12 @@ static void on_subchannel_connected(void* arg, grpc_error* error) { "Connect Failed", &error, 1), GRPC_ERROR_INT_GRPC_STATUS, GRPC_STATUS_UNAVAILABLE), "connect_failed"); + grpc_connectivity_state_set( + &c->state_and_health_tracker, GRPC_CHANNEL_TRANSIENT_FAILURE, + grpc_error_set_int(GRPC_ERROR_CREATE_REFERENCING_FROM_STATIC_STRING( + "Connect Failed", &error, 1), + GRPC_ERROR_INT_GRPC_STATUS, GRPC_STATUS_UNAVAILABLE), + "connect_failed"); const char* errmsg = grpc_error_string(error); gpr_log(GPR_INFO, "Connect failed: %s", errmsg); @@ -956,15 +1100,8 @@ void ConnectedSubchannel::Ping(grpc_closure* on_initiate, grpc_error* ConnectedSubchannel::CreateCall(const CallArgs& args, grpc_subchannel_call** call) { - size_t allocation_size = - GPR_ROUND_UP_TO_ALIGNMENT_SIZE(sizeof(grpc_subchannel_call)); - if (args.parent_data_size > 0) { - allocation_size += - GPR_ROUND_UP_TO_ALIGNMENT_SIZE(channel_stack_->call_stack_size) + - args.parent_data_size; - } else { - allocation_size += channel_stack_->call_stack_size; - } + const size_t allocation_size = + GetInitialCallSizeEstimate(args.parent_data_size); *call = static_cast<grpc_subchannel_call*>( gpr_arena_alloc(args.arena, allocation_size)); grpc_call_stack* callstk = SUBCHANNEL_CALL_TO_CALL_STACK(*call); @@ -994,4 +1131,18 @@ grpc_error* ConnectedSubchannel::CreateCall(const CallArgs& args, return GRPC_ERROR_NONE; } +size_t ConnectedSubchannel::GetInitialCallSizeEstimate( + size_t parent_data_size) const { + size_t allocation_size = + GPR_ROUND_UP_TO_ALIGNMENT_SIZE(sizeof(grpc_subchannel_call)); + if (parent_data_size > 0) { + allocation_size += + GPR_ROUND_UP_TO_ALIGNMENT_SIZE(channel_stack_->call_stack_size) + + parent_data_size; + } else { + allocation_size += channel_stack_->call_stack_size; + } + return allocation_size; +} + } // namespace grpc_core diff --git a/src/core/ext/filters/client_channel/subchannel.h b/src/core/ext/filters/client_channel/subchannel.h index c53b13e37e..ec3b4d86e4 100644 --- a/src/core/ext/filters/client_channel/subchannel.h +++ b/src/core/ext/filters/client_channel/subchannel.h @@ -103,6 +103,8 @@ class ConnectedSubchannel : public RefCountedWithTracing<ConnectedSubchannel> { } intptr_t socket_uuid() { return socket_uuid_; } + size_t GetInitialCallSizeEstimate(size_t parent_data_size) const; + private: grpc_channel_stack* channel_stack_; // ref counted pointer to the channelz node in this connected subchannel's @@ -143,13 +145,14 @@ void* grpc_connected_subchannel_call_get_parent_data( /** poll the current connectivity state of a channel */ grpc_connectivity_state grpc_subchannel_check_connectivity( - grpc_subchannel* channel, grpc_error** error); + grpc_subchannel* channel, grpc_error** error, bool inhibit_health_checking); /** Calls notify when the connectivity state of a channel becomes different from *state. Updates *state with the new state of the channel. */ void grpc_subchannel_notify_on_state_change( grpc_subchannel* channel, grpc_pollset_set* interested_parties, - grpc_connectivity_state* state, grpc_closure* notify); + grpc_connectivity_state* state, grpc_closure* notify, + bool inhibit_health_checks); /** retrieve the grpc_core::ConnectedSubchannel - or nullptr if not connected * (which may happen before it initially connects or during transient failures) |