aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2018-04-11 15:59:47 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-04-11 16:06:18 -0700
commitd6e2513d60999bf0cf315c42a14c0e45eb49cda2 (patch)
tree9bce39960da909b12d50d6468b5e0917347c6ef7
parentacad7022b09b090da0684f209ac8d0feb1c986a2 (diff)
Support profiling multiple TPUs through one gRPC request and one session.
Data are saved with a host prefix. PiperOrigin-RevId: 192523668
-rw-r--r--tensorflow/contrib/tpu/profiler/BUILD1
-rw-r--r--tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc53
-rw-r--r--tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc3
-rw-r--r--tensorflow/contrib/tpu/profiler/tpu_profiler.proto7
-rw-r--r--tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto6
5 files changed, 60 insertions, 10 deletions
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 1c32993e8e..dbf1ab6bbf 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -46,6 +46,7 @@ tf_cc_binary(
visibility = ["//visibility:public"],
deps = [
":dump_tpu_profile",
+ ":tpu_profiler_analysis_proto_cc",
":tpu_profiler_proto_cc",
":version",
"//tensorflow/core:framework_internal",
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 6b198dbc16..a535884263 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.grpc.pb.h"
#include "tensorflow/contrib/tpu/profiler/version.h"
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
#include "tensorflow/core/lib/core/errors.h"
@@ -40,6 +41,7 @@ namespace tensorflow {
namespace tpu {
namespace {
+using ::tensorflow::grpc::TPUProfileAnalysis;
using ::tensorflow::TPUProfiler;
constexpr uint64 kMaxEvents = 1000000;
@@ -64,11 +66,10 @@ Status ValidateHostPortPair(const string& host_port) {
return Status::OK();
}
-// Returns whether the returned trace is empty.
-// Failure are handled by CHECK, i.e. abort()
-bool Profile(const string& service_addr, const string& logdir, int duration_ms,
- const string& repository_root, const string& session_id,
- const ProfileOptions& opts) {
+ProfileRequest PopulateProfileRequest(int duration_ms,
+ const string& repository_root,
+ const string& session_id,
+ const ProfileOptions& opts) {
ProfileRequest request;
request.set_duration_ms(duration_ms);
request.set_max_events(kMaxEvents);
@@ -83,6 +84,17 @@ bool Profile(const string& service_addr, const string& logdir, int duration_ms,
*request.mutable_opts() = opts;
std::cout << "Limiting the number of trace events to " << kMaxEvents
<< std::endl;
+ return request;
+}
+
+// Returns whether the returned trace is empty.
+// Failures are handled by CHECK, i.e. abort()
+bool Profile(const string& service_addr, const string& logdir, int duration_ms,
+ const string& repository_root, const string& session_id,
+ const ProfileOptions& opts) {
+ ProfileRequest request =
+ PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+
::grpc::ClientContext context;
::grpc::ChannelArguments channel_args;
// TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
@@ -120,7 +132,36 @@ bool NewSession(const string& service_addr,
const std::vector<tensorflow::string>& hostnames,
int duration_ms, const string& repository_root,
const string& session_id, const ProfileOptions& opts) {
- return true;
+ NewProfileSessionRequest new_session_request;
+ *new_session_request.mutable_request() =
+ PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+ new_session_request.set_repository_root(repository_root);
+ new_session_request.set_session_id(session_id);
+ std::copy(
+ hostnames.begin(), hostnames.end(),
+ proto2::RepeatedFieldBackInserter(new_session_request.mutable_hosts()));
+
+ ::grpc::ClientContext context;
+ ::grpc::ChannelArguments channel_args;
+ // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+ // `ValidateHostPortPair` checks for empty host string case.
+ channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+ // TODO(jiesun): gRPC supports the following relevant naming schemes:
+ // 1. dns:///host:port
+ // 2. ipv4:host:port or ipv6:[host]:port
+ // We might need to change the prefix, depending on what the TPU name
+ // resolver gives us.
+ std::unique_ptr<TPUProfileAnalysis::Stub> stub =
+ TPUProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
+ "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+ channel_args));
+ NewProfileSessionResponse new_session_response;
+ TF_QCHECK_OK(FromGrpcStatus(
+ stub->NewSession(&context, new_session_request, &new_session_response)));
+
+ std::cout << "Profile session succeed for hosts:"
+ << str_util::Join(hostnames, ",");
+ return new_session_response.empty_trace();
}
} // namespace
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index ae508583f8..b53f9be2e2 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -64,7 +64,8 @@ Status WriteGzippedDataToFile(const string& filename, const string& data) {
Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
const string& encoded_trace, std::ostream* os) {
- string proto_path = JoinPath(run_dir, kProtoTraceFileName);
+ string proto_path =
+ JoinPath(run_dir, StrCat(host_prefix, kProtoTraceFileName));
TF_RETURN_IF_ERROR(
WriteStringToFile(Env::Default(), proto_path, encoded_trace));
LOG(INFO) << "Dumped raw-proto trace data to " << proto_path;
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index 8505c4bc69..7be694e866 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -96,5 +96,10 @@ message ProfileResponse {
// Data payload for each required tools.
repeated ProfileToolData tool_data = 6;
- // next-field: 7
+
+ // When we write profiling data directly to repository directory, we need a
+ // way to figure out whether the captured trace is empty (due to idle TPU).
+ bool empty_trace = 7;
+
+ // next-field: 8
}
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
index a4fc8d4e87..8b0bbde98e 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
@@ -7,13 +7,15 @@ message NewProfileSessionRequest {
ProfileRequest request = 1;
string repository_root = 2;
repeated string hosts = 3;
+ string session_id = 4;
}
message NewProfileSessionResponse {
// Auxiliary error_message.
string error_message = 1;
- // If success, return session identifier for future reference.
- string session_id = 2;
+
+ // Whether all hosts returned an empty trace.
+ bool empty_trace = 2;
}
message EnumProfileSessionsAndToolsRequest {