Diffstat (limited to 'tensorflow/stream_executor/trace_listener.h')
-rw-r--r--  tensorflow/stream_executor/trace_listener.h  |  59
1 file changed, 59 insertions(+), 0 deletions(-)
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
new file mode 100644
index 0000000000..dcbb223f4f
--- /dev/null
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -0,0 +1,59 @@
+// This file defines the StreamExecutor trace listener, used for inserting
+// non-device-specific instrumentation into the StreamExecutor.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel.h"
+#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/lib/status.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+
+// Traces StreamExecutor PIMPL-level events.
+// The few StreamExecutor interfaces that are synchronous have both Begin and
+// Complete versions of their trace calls. Asynchronous operations only have
+// Submit calls, as execution of the underlying operations is device-specific.
+// As all tracing calls mirror StreamExecutor routines, documentation here is
+// minimal.
+//
+// All calls have default implementations that perform no work; subclasses
+// should override functionality of interest. Keep in mind that these routines
+// are not called on a dedicated thread, so callbacks should execute quickly.
+//
+// Note: This API is constructed on an as-needed basis. Users should add
+// support for further StreamExecutor operations as required. By enforced
+// convention (see SCOPED_TRACE in stream_executor_pimpl.cc), synchronous
+// tracepoints should be named NameBegin and NameComplete.
+class TraceListener {
+ public:
+ virtual ~TraceListener() {}
+
+ virtual void LaunchSubmit(Stream* stream, const ThreadDim& thread_dims,
+ const BlockDim& block_dims,
+ const KernelBase& kernel,
+ const std::vector<KernelArg>& args) {}
+
+ virtual void SynchronousMemcpyH2DBegin(int64 correlation_id,
+ const void* host_src, int64 size,
+ DeviceMemoryBase* gpu_dst) {}
+ virtual void SynchronousMemcpyH2DComplete(int64 correlation_id,
+ const port::Status* result) {}
+
+ virtual void SynchronousMemcpyD2HBegin(int64 correlation_id,
+ const DeviceMemoryBase& gpu_src,
+ int64 size, void* host_dst) {}
+ virtual void SynchronousMemcpyD2HComplete(int64 correlation_id,
+ const port::Status* result) {}
+
+ virtual void BlockHostUntilDoneBegin(int64 correlation_id, Stream* stream) {}
+ virtual void BlockHostUntilDoneComplete(int64 correlation_id, bool result) {}
+};
+
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
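For context (not part of the commit), here is a minimal sketch of how a subclass might hook these callbacks. The LoggingTraceListener name and the std::cerr logging are illustrative assumptions; the override signatures mirror the header above.

#include <iostream>
#include <vector>

#include "tensorflow/stream_executor/trace_listener.h"

namespace perftools {
namespace gputools {

// Logs a line for each traced event. The remaining TraceListener callbacks
// keep their inherited do-nothing defaults.
class LoggingTraceListener : public TraceListener {
 public:
  // Asynchronous operations only get a Submit call.
  void LaunchSubmit(Stream* stream, const ThreadDim& thread_dims,
                    const BlockDim& block_dims, const KernelBase& kernel,
                    const std::vector<KernelArg>& args) override {
    std::cerr << "kernel launch submitted\n";
  }

  // Synchronous operations get Begin/Complete pairs keyed by correlation_id.
  void SynchronousMemcpyH2DBegin(int64 correlation_id, const void* host_src,
                                 int64 size,
                                 DeviceMemoryBase* gpu_dst) override {
    // These callbacks run on the caller's thread, so keep them cheap.
    std::cerr << "H2D copy begin, id=" << correlation_id
              << " bytes=" << size << "\n";
  }
  void SynchronousMemcpyH2DComplete(int64 correlation_id,
                                    const port::Status* result) override {
    std::cerr << "H2D copy complete, id=" << correlation_id
              << " ok=" << result->ok() << "\n";
  }
};

}  // namespace gputools
}  // namespace perftools

Installing such a listener would go through the StreamExecutor PIMPL layer (stream_executor_pimpl.cc, mentioned in the class comment); the exact registration call is outside this diff.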