1 files changed, 65 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/launch_dim.h b/tensorflow/stream_executor/launch_dim.h
new file mode 100644
index 0000000000..9b870ed6aa
--- /dev/null
+++ b/tensorflow/stream_executor/launch_dim.h
@@ -0,0 +1,65 @@
+// Types to express dimensionality of a kernel launch. Blocks and threads
+// are (up to) 3-dimensional.
+//
+// A thread is conceptually like a SIMD lane. Some number, typically 32
+// (though that fact should not be relied on) SIMD lanes are tied together with
+// a single PC in a unit called a warp. There is a maximum number of threads
+// that can execute in a shared-context entity called a block. Presently, that
+// number is 1024 -- again, something that should not be relied on from this
+// comment, but checked via perftools::gputools::DeviceDescription.
+//
+// For additional information, see
+// http://docs.nvidia.com/cuda/kepler-tuning-guide/#device-utilization-and-occupancy
+//
+// Because of that modest thread-per-block limit, a kernel can be launched with
+// multiple blocks. Each block is indivisibly scheduled onto a single core.
+// Blocks can also be used in a multi-dimensional configuration, and the block
+// count has much less modest limits -- typically they're similar to the maximum
+// amount of addressable memory.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+// Basic type that represents a 3-dimensional index space.
+struct Dim3D {
+  uint64 x, y, z;
+
+  Dim3D(uint64 x, uint64 y, uint64 z) : x(x), y(y), z(z) {}
+};
+
+// Thread dimensionality for use in a kernel launch. See file comment for
+// details.
+struct ThreadDim : public Dim3D {
+  explicit ThreadDim(uint64 x = 1, uint64 y = 1, uint64 z = 1)
+      : Dim3D(x, y, z) {}
+
+  // Returns a string representation of the thread dimensionality.
+  string ToString() const {
+    return port::StrCat("ThreadDim{", x, ", ", y, ", ", z, "}");
+  }
+};
+
+// Block dimensionality for use in a kernel launch. See file comment for
+// details.
+struct BlockDim : public Dim3D {
+  explicit BlockDim(uint64 x = 1, uint64 y = 1, uint64 z = 1)
+      : Dim3D(x, y, z) {}
+
+  // Returns a string representation of the block dimensionality.
+  string ToString() const {
+    return port::StrCat("BlockDim{", x, ", ", y, ", ", z, "}");
+  }
+};
+
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_