diff options
Diffstat (limited to 'tensorflow/stream_executor/kernel_spec.h')
-rw-r--r-- | tensorflow/stream_executor/kernel_spec.h | 365 |
1 files changed, 365 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h new file mode 100644 index 0000000000..01a47ac253 --- /dev/null +++ b/tensorflow/stream_executor/kernel_spec.h @@ -0,0 +1,365 @@ +// Kernel-loader specs are structures that describe how to load a data-parallel +// kernel on a given platform for subsequent launching. Headers that instantiate +// these data structures will typically be auto-generated. However, users can +// also instantiate them by hand. +// +// A kernel with the same exact functionality and type signature may be +// implemented on several different platforms. Typical usage is to create a +// singleton that describes how to load a kernel on the various supported +// platforms: +// +// static const MultiKernelLoaderSpec &SaxpySpec() { +// static auto *mkls = +// (new MultiKernelLoaderSpec{4 /* = arity */}) +// ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname) +// ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname); +// +// return *mkls; +// } +// +// This lazily instantiates an object that describes how to load CUDA PTX +// present on disk that implements saxpy for the CUDA platform, or +// OpenCL text present on disk that implements saxpy for an OpenCL-based +// platform. The CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of +// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for +// subsequent launching on a single platform. +// +// For the loader functionality that accepts these KernelLoaderSpecs in order +// to grab the kernel appropriately, see StreamExecutor::GetKernel(). 
+ +#ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ +#define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ + +#include <stddef.h> +#include <map> +#include <memory> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/lib/stringpiece.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +// Describes how to load a kernel on a target platform. +// +// This is an abstract base class, subclassed for specific platforms. +// The filename_or_text field represents the program location (i.e. PTX or +// OpenCL loadable translation unit path) and is simply stored; whether it is a +// filename or text is exposed via more specifically named accessors in +// subclasses. +// +// These kernel loader specifications are typically auto-generated into header +// files at build time, but can also be specified manually. +class KernelLoaderSpec { + public: + virtual ~KernelLoaderSpec() {} + + // Returns the kernel name to load out of the program. + const string &kernelname() const { return kernelname_; } + + protected: + explicit KernelLoaderSpec(port::StringPiece kernelname); + + private: + // The kernel name that should be loaded out of the program description given + // above. + string kernelname_; + + SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec); +}; + +// An abstract kernel loader spec that has an associated file path, where +// there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose +// canonical filename suffix is ".ptx". +class OnDiskKernelLoaderSpec : public KernelLoaderSpec { + public: + ~OnDiskKernelLoaderSpec() override {} + + // Returns the path to the on-disk loadable kernel file. + const string &filename() const { return filename_; } + + // Returns the canonical suffix for this on-disk kernel loader spec format; + // e.g. 
PTX files on disk have a canonical suffix of ".ptx". + virtual const char *CanonicalSuffix() const = 0; + + protected: + OnDiskKernelLoaderSpec(port::StringPiece filename, + port::StringPiece kernelname); + + string filename_; + + private: + SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec); +}; + +// Kernel loader specification for PTX text that resides on disk. +class CudaPtxOnDisk : public OnDiskKernelLoaderSpec { + public: + CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname); + ~CudaPtxOnDisk() override {} + + const char *CanonicalSuffix() const override { return ".ptx"; } + + private: + SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk); +}; + +// Kernel loader specification for CUBIN binary that resides on disk. +class CudaCubinOnDisk : public OnDiskKernelLoaderSpec { + public: + CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname); + ~CudaCubinOnDisk() override {} + + const string &filename() const { return filename_; } + + const char *CanonicalSuffix() const override { return ".cubin"; } + + private: + string filename_; + + SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk); +}; + +// Kernel loader specification for PTX text that resides in memory. +class CudaPtxInMemory : public KernelLoaderSpec { + public: + // Components: compute capability major number, compute capability minor + // number, and PTX source. + typedef std::tuple<int, int, port::StringPiece> PtxSpec; + + // Single-PTX constructor. Adds the provided PTX version with an unknown + // compute capability. Since the CC is unknown, the PTX is assumed to be very + // generally usable - in other words, PTX specified in this manner is VERY + // likely to be used as the default! Note that the PTX can be compressed, + // which is indicated by the argument ptx_compressed. + // + // Warning: the string backing the provided port::StringPiece ptx must outlive this + // instance. 
+ CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname, + bool ptx_compressed = false); + + // Multiple-PTX-version constructor. Adds each item in spec_list to this + // object. Note that the PTX can be compressed, which is indicated by the + // argument ptx_compressed. + CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list, + port::StringPiece kernel_name, bool ptx_compressed = false); + ~CudaPtxInMemory() override {} + + // Add the PTX implementation described by ptx_spec to this object. On + // collision (i.e., if a version with the same compute_capability already + // exists), the existing implementation will be overwritten. + void AddSpec(PtxSpec ptx_spec); + + // Returns pointer to the ptx of available implementation with the + // lowest-valued compute capability. For example, if PTX written to CC2.0, + // 3.0, and 3.5 are all available, the version for CC2.0 will be set. Returns + // nullptr on failed lookup (if any version is not available). + // When the ptx is compressed, returns the decompressed ptx. + const char *default_text() const; + + // Similar to default_text(). + // When the ptx is compressed, returns the decompressed ptx. + const char *original_default_text() const; + + // Returns pointer to the ptx for the requested compute capability. + // Returns nullptr on failed lookup (if the requested version is not + // available). + // When the ptx is compressed, returns the decompressed ptx. + const char *text(int compute_capability_major, + int compute_capability_minor) const; + + // Similar to text(). + // When the ptx is compressed, returns the original compressed ptx. + const char *original_text(int compute_capability_major, + int compute_capability_minor) const; + + // Decompresses the PTX string using bzip2. + static string DecompressPtx(const char *ptx); + + private: + // PTX translation unit text contents in memory. The key is of as a tuple + // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". 
Because CC's + // represented in this way have a clear sorting order, map::begin() will give + // the lowest-numbered version available, i.e. the default. + std::map<std::tuple<int, int>, const char *, + bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)> + ptx_by_compute_capability_; + + // Stores all decompressed ptx strings, with original ptx string as keys. + // It is marked as mutable for lazy decompression. + mutable std::map<const char *, string> decompressed_ptx_; + mutable mutex mu_; + + // Defines the minimum compute capability possible. Used when PTX has no + // compute capability specified (in the single-PTX constructor). + static const std::tuple<int, int> kMinimumCapability; + + SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory); +}; + +// Kernel loader specification for OpenCL text that resides on disk. +class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec { + public: + OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname); + ~OpenCLTextOnDisk() override {} + + const char *CanonicalSuffix() const override { return ".ocl"; } + + private: + SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk); +}; + +// Kernel loader specification for OpenCL binary that resides on disk. +class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec { + public: + OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname); + ~OpenCLBinaryOnDisk() override {} + + const char *CanonicalSuffix() const override { return ".aocx"; } + + private: + SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk); +}; + +// Kernel loader specification for OpenCL text that resides in memory. +class OpenCLTextInMemory : public KernelLoaderSpec { + public: + OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname); + ~OpenCLTextInMemory() override {} + + // Returns the OpenCL text contents. + const string &text() const { return text_; } + + private: + // OpenCL translation unit text contents in memory. 
+ string text_; + + SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory); +}; + +// Kernel loader specification for a CUBIN blob that resides in memory. +class CudaCubinInMemory : public KernelLoaderSpec { + public: + CudaCubinInMemory(const char *bytes, port::StringPiece kernelname); + ~CudaCubinInMemory() override {} + + const char *bytes() const { return bytes_; } + + private: + const char *bytes_; + + SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory); +}; + +// Describes how to load a kernel on any subset of a number of target platforms. +class MultiKernelLoaderSpec { + public: + explicit MultiKernelLoaderSpec(size_t arity); + + // Returns the number of arguments that this kernel accepts. + size_t arity() const { return arity_; } + + // Convenience getters for testing whether these platform variants have + // kernel loader specifications available. + bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; } + bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; } + bool has_cuda_cubin_in_memory() const { + return cuda_cubin_in_memory_ != nullptr; + } + bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; } + bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; } + bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; } + bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; } + + // Accessors for platform variant kernel load specifications. + // Precondition: corresponding has_* is true. 
+ const CudaPtxOnDisk &cuda_ptx_on_disk() const { + CHECK(has_cuda_ptx_on_disk()); + return *cuda_ptx_on_disk_; + } + const CudaCubinOnDisk &cuda_cubin_on_disk() const { + CHECK(has_cuda_cubin_on_disk()); + return *cuda_cubin_on_disk_; + } + const CudaCubinInMemory &cuda_cubin_in_memory() const { + CHECK(has_cuda_cubin_in_memory()); + return *cuda_cubin_in_memory_; + } + const CudaPtxInMemory &cuda_ptx_in_memory() const { + CHECK(has_cuda_ptx_in_memory()); + return *cuda_ptx_in_memory_; + } + const OpenCLTextOnDisk &ocl_text_on_disk() const { + CHECK(has_ocl_text_on_disk()); + return *ocl_text_on_disk_; + } + const OpenCLBinaryOnDisk &ocl_binary_on_disk() const { + CHECK(has_ocl_binary_on_disk()); + return *ocl_binary_on_disk_; + } + const OpenCLTextInMemory &ocl_text_in_memory() const { + CHECK(has_ocl_text_in_memory()); + return *ocl_text_in_memory_; + } + + // Builder-pattern-like methods for use in initializing a + // MultiKernelLoaderSpec. Each of these should be used at most once for a + // single MultiKernelLoaderSpec object. See file comment for example usage. + // + // Note that the kernelname parameter must be consistent with the kernel in + // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel + // name may be mangled by the compiler if it is not declared in an + // extern "C" scope. 
+ MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( + port::StringPiece ptx, port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaPtxInMemory( + std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( + std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, + port::StringPiece kernelname); + + private: + std::unique_ptr<CudaPtxOnDisk> + cuda_ptx_on_disk_; // PTX text that resides in a file. + std::unique_ptr<CudaCubinOnDisk> + cuda_cubin_on_disk_; // Binary CUDA program in a file. + std::unique_ptr<CudaCubinInMemory> + cuda_cubin_in_memory_; // Binary CUDA program in memory. + std::unique_ptr<CudaPtxInMemory> + cuda_ptx_in_memory_; // PTX text that resides in memory. + std::unique_ptr<OpenCLTextOnDisk> + ocl_text_on_disk_; // OpenCL text that resides on disk. + std::unique_ptr<OpenCLBinaryOnDisk> + ocl_binary_on_disk_; // OpenCL binary that resides on disk. + std::unique_ptr<OpenCLTextInMemory> + ocl_text_in_memory_; // OpenCL text that resides in memory. + + // Number of parameters that the kernel takes. (This is nicer to have in a + // constexpr than having to determine it from the types via template + // metaprogramming). 
+ size_t arity_; +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ |