Diffstat (limited to 'tensorflow/stream_executor/kernel_spec.h')
-rw-r--r--  tensorflow/stream_executor/kernel_spec.h  365
1 files changed, 365 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h
new file mode 100644
index 0000000000..01a47ac253
--- /dev/null
+++ b/tensorflow/stream_executor/kernel_spec.h
@@ -0,0 +1,365 @@
+// Kernel-loader specs are structures that describe how to load a data-parallel
+// kernel on a given platform for subsequent launching. Headers that instantiate
+// these data structures will typically be auto-generated. However, users can
+// also instantiate them by hand.
+//
+// A kernel with the same exact functionality and type signature may be
+// implemented on several different platforms. Typical usage is to create a
+// singleton that describes how to load a kernel on the various supported
+// platforms:
+//
+//   static const MultiKernelLoaderSpec &SaxpySpec() {
+//     static auto *mkls =
+//         (new MultiKernelLoaderSpec{4 /* = arity */})
+//             ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
+//             ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
+//
+//     return *mkls;
+//   }
+//
+// This lazily instantiates an object that describes how to load CUDA PTX
+// present on disk that implements saxpy for the CUDA platform, or
+// OpenCL text present on disk that implements saxpy for an OpenCL-based
+// platform. The CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
+// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
+// subsequent launching on a single platform.
+//
+// For the loader functionality that accepts these KernelLoaderSpecs in order
+// to grab the kernel appropriately, see StreamExecutor::GetKernel().
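+//
+// As a rough sketch of how such a spec might be consumed (for illustration
+// only; the exact GetKernel() signature lives in stream_executor.h, and
+// "executor" below is a placeholder for a StreamExecutor pointer):
+//
+//   KernelBase kernel(executor);
+//   if (!executor->GetKernel(SaxpySpec(), &kernel)) {
+//     // None of the provided loader specs matched the executor's platform.
+//   }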
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
+#define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
+
+#include <stddef.h>
+
+#include <initializer_list>
+#include <map>
+#include <memory>
+#include <tuple>
+
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+// Describes how to load a kernel on a target platform.
+//
+// This is an abstract base class, subclassed for specific platforms.
+// Each subclass stores the program location (e.g. a PTX file path or OpenCL
+// translation unit text); whether that location is a filename or in-memory
+// text is exposed via more specifically named accessors in the subclass.
+//
+// These kernel loader specifications are typically auto-generated into header
+// files at build time, but can also be specified manually.
+class KernelLoaderSpec {
+ public:
+ virtual ~KernelLoaderSpec() {}
+
+ // Returns the kernel name to load out of the program.
+ const string &kernelname() const { return kernelname_; }
+
+ protected:
+ explicit KernelLoaderSpec(port::StringPiece kernelname);
+
+ private:
+ // The name of the kernel to load out of the program.
+ string kernelname_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
+};
+
+// An abstract kernel loader spec that has an associated file path, where
+// there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose
+// canonical filename suffix is ".ptx".
+class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
+ public:
+ ~OnDiskKernelLoaderSpec() override {}
+
+ // Returns the path to the on-disk loadable kernel file.
+ const string &filename() const { return filename_; }
+
+ // Returns the canonical suffix for this on-disk kernel loader spec format;
+ // e.g. PTX files on disk have a canonical suffix of ".ptx".
+ virtual const char *CanonicalSuffix() const = 0;
+
+ protected:
+ OnDiskKernelLoaderSpec(port::StringPiece filename,
+ port::StringPiece kernelname);
+
+ string filename_;
+
+ private:
+ SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec);
+};
+
+// Kernel loader specification for PTX text that resides on disk.
+class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
+ public:
+ CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+ ~CudaPtxOnDisk() override {}
+
+ const char *CanonicalSuffix() const override { return ".ptx"; }
+
+ private:
+ SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk);
+};
+
+// Kernel loader specification for CUBIN binary that resides on disk.
+class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
+ public:
+ CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+ ~CudaCubinOnDisk() override {}
+
+ const string &filename() const { return filename_; }
+
+ const char *CanonicalSuffix() const override { return ".cubin"; }
+
+ private:
+ string filename_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk);
+};
+
+// Kernel loader specification for PTX text that resides in memory.
+class CudaPtxInMemory : public KernelLoaderSpec {
+ public:
+ // Components: compute capability major number, compute capability minor
+ // number, and PTX source.
+ typedef std::tuple<int, int, port::StringPiece> PtxSpec;
+
+ // Single-PTX constructor. Adds the provided PTX with an unknown compute
+ // capability. Since the CC is unknown, PTX added this way is assumed to be
+ // broadly usable, and so is the most likely candidate to be returned as the
+ // default. The PTX may be compressed, as indicated by ptx_compressed.
+ //
+ // Warning: the string backing the provided port::StringPiece ptx must
+ // outlive this instance.
+ CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname,
+ bool ptx_compressed = false);
+
+ // Multiple-PTX-version constructor. Adds each item in spec_list to this
+ // object. Note that the PTX can be compressed, which is indicated by the
+ // argument ptx_compressed.
+ CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list,
+ port::StringPiece kernel_name, bool ptx_compressed = false);
+ ~CudaPtxInMemory() override {}
+
+ // Add the PTX implementation described by ptx_spec to this object. On
+ // collision (i.e., if a version with the same compute_capability already
+ // exists), the existing implementation will be overwritten.
+ void AddSpec(PtxSpec ptx_spec);
+
+ // Returns a pointer to the PTX of the available implementation with the
+ // lowest-valued compute capability. For example, if PTX for CC 2.0, 3.0,
+ // and 3.5 are all available, the CC 2.0 version is returned. Returns
+ // nullptr if no version is available.
+ // When the PTX is compressed, returns the decompressed PTX.
+ const char *default_text() const;
+
+ // Similar to default_text(), but when the PTX is compressed, returns the
+ // original compressed PTX.
+ const char *original_default_text() const;
+
+ // Returns a pointer to the PTX for the requested compute capability, or
+ // nullptr if that version is not available.
+ // When the PTX is compressed, returns the decompressed PTX.
+ const char *text(int compute_capability_major,
+ int compute_capability_minor) const;
+
+ // Similar to text().
+ // When the ptx is compressed, returns the original compressed ptx.
+ const char *original_text(int compute_capability_major,
+ int compute_capability_minor) const;
+
+ // Decompresses the PTX string using bzip2.
+ static string DecompressPtx(const char *ptx);
+
+ private:
+ // PTX translation unit text contents in memory. The key is a
+ // (cc_major, cc_minor) tuple, e.g. (2, 0), (3, 0), (3, 5). Because CCs
+ // represented this way have a clear sort order, map::begin() yields the
+ // lowest-numbered version available, i.e. the default.
+ std::map<std::tuple<int, int>, const char *,
+ bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)>
+ ptx_by_compute_capability_;
+
+ // Stores all decompressed PTX strings, keyed by the original PTX string.
+ // Marked mutable to allow lazy decompression from const accessors.
+ mutable std::map<const char *, string> decompressed_ptx_;
+ mutable mutex mu_;
+
+ // Defines the minimum compute capability possible. Used when PTX has no
+ // compute capability specified (in the single-PTX constructor).
+ static const std::tuple<int, int> kMinimumCapability;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
+};
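+
+// For illustration, a multi-version CudaPtxInMemory might be built and queried
+// roughly as follows (kSaxpyPtx30 and kSaxpyPtx35 stand in for null-terminated
+// PTX strings with static storage duration):
+//
+//   CudaPtxInMemory ptx_spec(
+//       {CudaPtxInMemory::PtxSpec(3, 0, kSaxpyPtx30),
+//        CudaPtxInMemory::PtxSpec(3, 5, kSaxpyPtx35)},
+//       "saxpy");
+//   const char *ptx35 = ptx_spec.text(3, 5);       // CC 3.5 PTX or nullptr.
+//   const char *lowest = ptx_spec.default_text();  // Lowest CC, here 3.0.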
+
+// Kernel loader specification for OpenCL text that resides on disk.
+class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
+ public:
+ OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+ ~OpenCLTextOnDisk() override {}
+
+ const char *CanonicalSuffix() const override { return ".ocl"; }
+
+ private:
+ SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk);
+};
+
+// Kernel loader specification for OpenCL binary that resides on disk.
+class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
+ public:
+ OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+ ~OpenCLBinaryOnDisk() override {}
+
+ const char *CanonicalSuffix() const override { return ".aocx"; }
+
+ private:
+ SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk);
+};
+
+// Kernel loader specification for OpenCL text that resides in memory.
+class OpenCLTextInMemory : public KernelLoaderSpec {
+ public:
+ OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname);
+ ~OpenCLTextInMemory() override {}
+
+ // Returns the OpenCL text contents.
+ const string &text() const { return text_; }
+
+ private:
+ // OpenCL translation unit text contents in memory.
+ string text_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory);
+};
+
+// Kernel loader specification for a CUBIN blob that resides in memory.
+class CudaCubinInMemory : public KernelLoaderSpec {
+ public:
+ CudaCubinInMemory(const char *bytes, port::StringPiece kernelname);
+ ~CudaCubinInMemory() override {}
+
+ const char *bytes() const { return bytes_; }
+
+ private:
+ const char *bytes_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory);
+};
+
+// Describes how to load a kernel on any subset of a number of target platforms.
+class MultiKernelLoaderSpec {
+ public:
+ explicit MultiKernelLoaderSpec(size_t arity);
+
+ // Returns the number of arguments that this kernel accepts.
+ size_t arity() const { return arity_; }
+
+ // Convenience getters for testing whether these platform variants have
+ // kernel loader specifications available.
+ bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; }
+ bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; }
+ bool has_cuda_cubin_in_memory() const {
+ return cuda_cubin_in_memory_ != nullptr;
+ }
+ bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; }
+ bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; }
+ bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; }
+ bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; }
+
+ // Accessors for platform variant kernel load specifications.
+ // Precondition: corresponding has_* is true.
+ const CudaPtxOnDisk &cuda_ptx_on_disk() const {
+ CHECK(has_cuda_ptx_on_disk());
+ return *cuda_ptx_on_disk_;
+ }
+ const CudaCubinOnDisk &cuda_cubin_on_disk() const {
+ CHECK(has_cuda_cubin_on_disk());
+ return *cuda_cubin_on_disk_;
+ }
+ const CudaCubinInMemory &cuda_cubin_in_memory() const {
+ CHECK(has_cuda_cubin_in_memory());
+ return *cuda_cubin_in_memory_;
+ }
+ const CudaPtxInMemory &cuda_ptx_in_memory() const {
+ CHECK(has_cuda_ptx_in_memory());
+ return *cuda_ptx_in_memory_;
+ }
+ const OpenCLTextOnDisk &ocl_text_on_disk() const {
+ CHECK(has_ocl_text_on_disk());
+ return *ocl_text_on_disk_;
+ }
+ const OpenCLBinaryOnDisk &ocl_binary_on_disk() const {
+ CHECK(has_ocl_binary_on_disk());
+ return *ocl_binary_on_disk_;
+ }
+ const OpenCLTextInMemory &ocl_text_in_memory() const {
+ CHECK(has_ocl_text_in_memory());
+ return *ocl_text_in_memory_;
+ }
+
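+ // For illustration, a platform-specific loader consuming one of these specs
+ // might dispatch roughly as follows (the surrounding loader code is a
+ // placeholder, not part of this interface):
+ //
+ //   if (spec.has_cuda_ptx_in_memory()) {
+ //     const char *ptx = spec.cuda_ptx_in_memory().default_text();
+ //     // ... JIT and load the PTX for the current device ...
+ //   } else if (spec.has_cuda_cubin_on_disk()) {
+ //     const string &path = spec.cuda_cubin_on_disk().filename();
+ //     // ... load the CUBIN image from disk ...
+ //   }
+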
+ // Builder-pattern-like methods for use in initializing a
+ // MultiKernelLoaderSpec. Each of these should be used at most once for a
+ // single MultiKernelLoaderSpec object. See file comment for example usage.
+ //
+ // Note that the kernelname parameter must be consistent with the kernel in
+ // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
+ // name may be mangled by the compiler if it is not declared in an
+ // extern "C" scope.
+ MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
+ port::StringPiece ptx, port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddCudaPtxInMemory(
+ std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
+ port::StringPiece kernelname);
+ MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
+ std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
+ port::StringPiece kernelname);
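+
+ // For illustration, an in-memory variant of the file-comment example might
+ // look roughly like this (kSaxpyPtx stands in for a null-terminated PTX
+ // string with static storage duration):
+ //
+ //   auto *mkls = (new MultiKernelLoaderSpec{4 /* = arity */})
+ //                    ->AddCudaPtxInMemory(kSaxpyPtx, "saxpy");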
+
+ private:
+ std::unique_ptr<CudaPtxOnDisk>
+ cuda_ptx_on_disk_; // PTX text that resides in a file.
+ std::unique_ptr<CudaCubinOnDisk>
+ cuda_cubin_on_disk_; // Binary CUDA program in a file.
+ std::unique_ptr<CudaCubinInMemory>
+ cuda_cubin_in_memory_; // Binary CUDA program in memory.
+ std::unique_ptr<CudaPtxInMemory>
+ cuda_ptx_in_memory_; // PTX text that resides in memory.
+ std::unique_ptr<OpenCLTextOnDisk>
+ ocl_text_on_disk_; // OpenCL text that resides on disk.
+ std::unique_ptr<OpenCLBinaryOnDisk>
+ ocl_binary_on_disk_; // OpenCL binary that resides on disk.
+ std::unique_ptr<OpenCLTextInMemory>
+ ocl_text_in_memory_; // OpenCL text that resides in memory.
+
+ // Number of parameters that the kernel takes. (This is nicer to store
+ // directly than to recover from the argument types via template
+ // metaprogramming.)
+ size_t arity_;
+};
+
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_