Diffstat (limited to 'tensorflow/stream_executor')
113 files changed, 25529 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD new file mode 100644 index 0000000000..b91fe431f6 --- /dev/null +++ b/tensorflow/stream_executor/BUILD @@ -0,0 +1,39 @@ +licenses(["restricted"]) + +load("/tensorflow/tensorflow", "if_cuda") + +cc_library( + name = "stream_executor", + srcs = glob( + [ + "*.cc", + "lib/*.cc", + ], + exclude = [ + "**/*_test.cc", + ], + ) + if_cuda( + glob([ + "cuda/*.cc", + ]), + ), + hdrs = glob([ + "*.h", + "lib/*.h", + "platform/**/*.h", + ]), + data = [ + "//tensorflow/core:cuda", + "//third_party/gpus/cuda:cublas", + "//third_party/gpus/cuda:cudnn", + ], + linkopts = [ + "-ldl", + ], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + "//third_party/gpus/cuda:cuda_headers", + ], + alwayslink = 1, +) diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc new file mode 100644 index 0000000000..70a6bb7030 --- /dev/null +++ b/tensorflow/stream_executor/blas.cc @@ -0,0 +1,57 @@ +#include "tensorflow/stream_executor/blas.h" + +#include "tensorflow/stream_executor/lib/strcat.h" + +namespace perftools { +namespace gputools { +namespace blas { + +string TransposeString(Transpose t) { + switch (t) { + case Transpose::kNoTranspose: + return "NoTranspose"; + case Transpose::kTranspose: + return "Transpose"; + case Transpose::kConjugateTranspose: + return "ConjugateTranspose"; + default: + LOG(FATAL) << "Unknown transpose " << static_cast<int32>(t); + } +} + +string UpperLowerString(UpperLower ul) { + switch (ul) { + case UpperLower::kUpper: + return "Upper"; + case UpperLower::kLower: + return "Lower"; + default: + LOG(FATAL) << "Unknown upperlower " << static_cast<int32>(ul); + } +} + +string DiagonalString(Diagonal d) { + switch (d) { + case Diagonal::kUnit: + return "Unit"; + case Diagonal::kNonUnit: + return "NonUnit"; + default: + LOG(FATAL) << "Unknown diagonal " << static_cast<int32>(d); + } +} + +string SideString(Side s) { + switch (s) { + case Side::kLeft: + return "Left"; + case Side::kRight: + return "Right"; + default: + LOG(FATAL) << "Unknown side " << static_cast<int32>(s); + } +} + +} // namespace blas +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h new file mode 100644 index 0000000000..f6ee29837d --- /dev/null +++ b/tensorflow/stream_executor/blas.h @@ -0,0 +1,1780 @@ +// Exposes the family of BLAS routines as pre-canned high performance calls for +// use in conjunction with the StreamExecutor abstraction. +// +// Note that this interface is optionally supported by platforms; see +// StreamExecutor::SupportsBlas() for details. +// +// This abstraction makes it simple to entrain BLAS operations on GPU data into +// a Stream -- users typically will not use this API directly, but will use the +// Stream builder methods to entrain these operations "under the hood". For +// example: +// +// DeviceMemory<float> x = stream_exec->AllocateArray<float>(1024); +// DeviceMemory<float> y = stream_exec->AllocateArray<float>(1024); +// // ... populate x and y ... +// Stream stream{stream_exec}; +// stream +// .Init() +// .ThenBlasAxpy(1024, 5.5, x, 1, &y, 1) +// .BlockHostUntilDone(); +// +// By using stream operations in this manner the user can easily intermix custom +// kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned BLAS +// routines. 
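To make the entraining pattern above concrete end to end, here is a slightly fuller sketch of the same axpy example with the result read back to the host. It is illustrative only: it relies on the calls named in this header's comment (AllocateArray, Init, ThenBlasAxpy, BlockHostUntilDone) plus a ThenMemcpy-style device-to-host copy and Stream::ok(), which are assumed here rather than declared in this diff.

    namespace se = perftools::gputools;

    // Sketch: compute y <- 5.5 * x + y on the device, then read y back.
    bool RunAxpyExample(se::StreamExecutor *stream_exec) {
      se::DeviceMemory<float> x = stream_exec->AllocateArray<float>(1024);
      se::DeviceMemory<float> y = stream_exec->AllocateArray<float>(1024);
      // ... populate x and y with host-to-device copies ...
      std::vector<float> host_y(1024);
      se::Stream stream(stream_exec);
      stream.Init()
          .ThenBlasAxpy(1024, 5.5f, x, 1, &y, 1)
          .ThenMemcpy(host_y.data(), y, host_y.size() * sizeof(float));
      stream.BlockHostUntilDone();
      return stream.ok();  // false if any enqueued operation failed
    }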
+ +#ifndef TENSORFLOW_STREAM_EXECUTOR_BLAS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_BLAS_H_ + +#include <complex> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +class Stream; + +template <typename ElemT> +class DeviceMemory; + +namespace blas { + +// Specifies whether the input matrix will be transposed or +// transposed+conjugated before any BLAS operations. +enum class Transpose { kNoTranspose, kTranspose, kConjugateTranspose }; + +// Returns a name for t. +string TransposeString(Transpose t); + +// Specifies whether the upper or lower triangular part of a +// symmetric/Hermitian matrix is used. +enum class UpperLower { kUpper, kLower }; + +// Returns a name for ul. +string UpperLowerString(UpperLower ul); + +// Specifies whether a matrix is unit triangular. +enum class Diagonal { kUnit, kNonUnit }; + +// Returns a name for d. +string DiagonalString(Diagonal d); + +// Specifies whether a Hermitian matrix appears on the left or right in +// operation. +enum class Side { kLeft, kRight }; + +// Returns a name for s. +string SideString(Side s); + +// BLAS support interface -- this can be derived from a GPU executor when the +// underlying platform has an BLAS library implementation available. See +// StreamExecutor::AsBlas(). +// +// Thread-hostile: CUDA associates a CUDA-context with a particular thread in +// the system. Any operation that a user attempts to perform by enqueueing BLAS +// operations on a thread not-associated with the CUDA-context has unknown +// behavior at the current time; see b/13176597 +class BlasSupport { + public: + virtual ~BlasSupport() {} + + // Computes the sum of magnitudes of the vector elements. + // result <- |Re x(1)| + |Im x(1)| + |Re x(2)| + |Im x(2)|+ ... + |Re x(n)| + // + |Im x(n)|. + // Note that Im x(i) = 0 for real types float/double. + virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) = 0; + virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) = 0; + virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) = 0; + virtual bool DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) = 0; + + // Performs a BLAS y <- ax+y operation. + virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) = 0; + virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Copies vector to another vector: y <- x. 
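By analogy with the ThenBlasAxpy wrapper quoted at the top of this header, the level-1 routines here are typically reached through Stream builder wrappers of the same shape. A hypothetical chain for the copy declared just below, followed by an absolute-value sum (wrapper names assumed by analogy, not declared in this diff):

    // Sketch: y <- x, then result <- |y(1)| + ... + |y(n)|, enqueued in order.
    se::DeviceMemory<float> result = stream_exec->AllocateArray<float>(1);
    stream.ThenBlasCopy(n, x, 1, &y, 1)
          .ThenBlasAsum(n, y, 1, &result);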
+ virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) = 0; + virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Performs a BLAS dot product result <- x . y. + virtual bool DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *result) = 0; + virtual bool DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *result) = 0; + + // Performs a BLAS dot product result <- conj(x) . y for complex types. + virtual bool DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) = 0; + virtual bool DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) = 0; + + // Performs a BLAS dot product result <- x . y for complex types. Note that + // x is unconjugated in this routine. + virtual bool DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) = 0; + virtual bool DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) = 0; + + // Computes the Euclidean norm of a vector: result <- ||x||. + // See the following link for more information of Euclidean norm: + // http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm + virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) = 0; + virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) = 0; + virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) = 0; + virtual bool DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) = 0; + + // Performs rotation of points in the plane: + // x(i) = c*x(i) + s*y(i) + // y(i) = c*y(i) - s*x(i). 
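Restated as a plain host-side loop (reference semantics only, unit strides; the virtuals below run this on the device):

    for (uint64 i = 0; i < n; ++i) {
      float xi = x[i], yi = y[i];
      x[i] = c * xi + s * yi;  // x(i) = c*x(i) + s*y(i)
      y[i] = c * yi - s * xi;  // y(i) = c*y(i) - s*x(i)
    }

With c = cos(theta) and s = sin(theta) this applies a Givens rotation of angle theta to each point (x(i), y(i)).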
+ virtual bool DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, float c, + float s) = 0; + virtual bool DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, double c, + double s) = 0; + virtual bool DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy, + float c, float s) = 0; + virtual bool DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy, + double c, double s) = 0; + + // Computes the parameters for a Givens rotation. + // Given the Cartesian coordinates (a, b) of a point, these routines return + // the parameters c, s, r, and z associated with the Givens rotation. The + // parameters c and s define a unitary matrix such that: + // + // | c s |.| a | = | r | + // | -s c | | b | | 0 | + // + // The parameter z is defined such that if |a| > |b|, z is s; otherwise if + // c is not 0 z is 1/c; otherwise z is 1. + virtual bool DoBlasRotg(Stream *stream, DeviceMemory<float> *a, + DeviceMemory<float> *b, DeviceMemory<float> *c, + DeviceMemory<float> *s) = 0; + virtual bool DoBlasRotg(Stream *stream, DeviceMemory<double> *a, + DeviceMemory<double> *b, DeviceMemory<double> *c, + DeviceMemory<double> *s) = 0; + virtual bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a, + DeviceMemory<std::complex<float>> *b, + DeviceMemory<float> *c, + DeviceMemory<std::complex<float>> *s) = 0; + virtual bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a, + DeviceMemory<std::complex<double>> *b, + DeviceMemory<double> *c, + DeviceMemory<std::complex<double>> *s) = 0; + + // Performs modified Givens rotation of points in the plane. + // Given two vectors x and y, each vector element of these vectors is replaced + // as follows: + // + // | x(i) | = H | x(i) | + // | y(i) | | y(i) | + // + // for i=1 to n, where H is a modified Givens transformation matrix whose + // values are stored in the param[1] through param[4] array. + // For more information please Google this routine. + virtual bool DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, + const DeviceMemory<float> ¶m) = 0; + virtual bool DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, + const DeviceMemory<double> ¶m) = 0; + + // Computes the parameters for a modified Givens rotation. + // Given Cartesian coordinates (x1, y1) of an input vector, these routines + // compute the components of a modified Givens transformation matrix H that + // zeros the y-component of the resulting vector: + // + // | x1 | = H | x1 * sqrt(d1) | + // | 0 | | y1 * sqrt(d1) | + // + // For more information please Google this routine. + virtual bool DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1, + DeviceMemory<float> *d2, DeviceMemory<float> *x1, + const DeviceMemory<float> &y1, + DeviceMemory<float> *param) = 0; + virtual bool DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1, + DeviceMemory<double> *d2, DeviceMemory<double> *x1, + const DeviceMemory<double> &y1, + DeviceMemory<double> *param) = 0; + + // Computes the product of a vector by a scalar: x <- a*x. 
+ virtual bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<float> *x, int incx) = 0; + virtual bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<double> *x, int incx) = 0; + virtual bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<std::complex<float>> *x, int incx) = 0; + virtual bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<std::complex<double>> *x, int incx) = 0; + virtual bool DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + DeviceMemory<std::complex<float>> *x, int incx) = 0; + virtual bool DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + DeviceMemory<std::complex<double>> *x, int incx) = 0; + + // Swaps a vector with another vector. + virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy) = 0; + virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Finds the index of the element with maximum absolute value. + virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) = 0; + virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) = 0; + virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) = 0; + virtual bool DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) = 0; + + // Finds the index of the element with minimum absolute value. + virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) = 0; + virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) = 0; + virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) = 0; + virtual bool DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) = 0; + + // Computes a matrix-vector product using a general band matrix: + // + // y <- alpha * a * x + beta * y, + // or + // y <- alpha * a' * x + beta * y, + // or + // y <- alpha * conj(a') * x + beta * y, + // + // alpha and beta are scalars; a is an m-by-n general band matrix, with kl + // sub-diagonals and ku super-diagonals; x is a vector with + // n(trans==kNoTranspose)/m(otherwise) elements; + // y is a vector with m(trans==kNoTranspose)/n(otherwise) elements. 
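A reference restatement of the kNoTranspose case may help decode the band layout. It assumes standard BLAS column-major band storage, in which A(i, j) is held at a[ku + i - j + j * lda] with lda >= kl + ku + 1, and unit strides; this is a semantic sketch, not the device implementation:

    for (uint64 i = 0; i < m; ++i) y[i] *= beta;
    for (uint64 j = 0; j < n; ++j) {
      uint64 lo = j > ku ? j - ku : 0;          // first stored row in column j
      uint64 hi = std::min(m - 1, j + kl);      // last stored row in column j
      for (uint64 i = lo; i <= hi; ++i) {
        y[i] += alpha * a[ku + i - j + j * lda] * x[j];
      }
    }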
+ virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) = 0; + virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Computes a matrix-vector product using a general matrix. + // + // y <- alpha * a * x + beta * y, + // or + // y <- alpha * a' * x + beta * y, + // or + // y <- alpha * conj(a') * x + beta * y, + // + // alpha and beta are scalars; a is an m-by-n general matrix; x is a vector + // with n(trans==kNoTranspose)/m(otherwise) elements; + // y is a vector with m(trans==kNoTranspose)/n(otherwise) elements. + virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) = 0; + virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Performs a rank-1 update of a general matrix. + // + // a <- alpha * x * y' + a, + // + // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is + // an m-by-n general matrix. + virtual bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) = 0; + virtual bool DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) = 0; + + // Performs a rank-1 update (conjugated) of a general matrix. + // + // a <- alpha * x * conj(y') + a, + // + // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is + // an m-by-n general matrix. 
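Reference semantics for this conjugated update, assuming column-major storage and unit strides; the unconjugated Geru variant declared just below simply drops std::conj:

    for (uint64 j = 0; j < n; ++j) {
      for (uint64 i = 0; i < m; ++i) {
        // a <- alpha * x * conj(y') + a, element by element.
        a[i + j * lda] += alpha * x[i] * std::conj(y[j]);
      }
    }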
+ virtual bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) = 0; + virtual bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) = 0; + + // Performs a rank-1 update (unconjugated) of a general matrix. + // + // a <- alpha * x * y' + a, + // + // alpha is a scalar; x is an m-element vector; y is an n-element vector; a is + // an m-by-n general matrix. + virtual bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) = 0; + virtual bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) = 0; + + // Computes a matrix-vector product using a Hermitian band matrix. + // + // y <- alpha * a * x + beta * y, + // + // alpha and beta are scalars; a is an n-by-n Hermitian band matrix, with k + // super-diagonals; x and y are n-element vectors. + virtual bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Computes a matrix-vector product using a Hermitian matrix. + // + // y <- alpha * a * x + beta * y, + // + // alpha and beta are scalars; a is an n-by-n Hermitian matrix; x and y are + // n-element vectors. + virtual bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Performs a rank-1 update of a Hermitian matrix. + // + // a <- alpha * x * conj(x') + a, + // + // alpha is a scalar; x is an n-element vector; a is an n-by-n Hermitian + // matrix. 
+ virtual bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *a, int lda) = 0; + virtual bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *a, int lda) = 0; + + // Performs a rank-2 update of a Hermitian matrix. + // + // a <- alpha * x * conj(x') + conj(alpha) * y * conj(x') + a, + // + // alpha is a scalar; x and y are n-element vectors; a is an n-by-n Hermitian + // matrix. + virtual bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) = 0; + virtual bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) = 0; + + // Computes a matrix-vector product using a Hermitian packed matrix. + // + // y <- alpha * a * x + beta * y, + // + // alpha and beta are scalars; a is an n-by-n Hermitian matrix, supplied in + // packed form; x and y are n-element vectors. + virtual bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &ap, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) = 0; + virtual bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &ap, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) = 0; + + // Performs a rank-1 update of a Hermitian packed matrix. + // + // a <- alpha * x * conj(x') + a, + // + // alpha is a scalar; x is an n-element vector; a is an n-by-n Hermitian + // matrix, supplied in packed form. + virtual bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *ap) = 0; + virtual bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *ap) = 0; + + // Performs a rank-2 update of a Hermitian packed matrix. + // + // a <- alpha * x * conj(x') + conj(alpha) * y * conj(x') + a, + // + // alpha is a scalar; x and y are n-element vectors; a is an n-by-n Hermitian + // matrix, supplied in packed form. + virtual bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *ap) = 0; + virtual bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *ap) = 0; + + // Computes a matrix-vector product using a symmetric band matrix. 
+ // + // y <- alpha * a * x + beta * y, + // + // alpha and beta are scalars; a is an n-by-n symmetric band matrix, with k + // super-diagonals; x and y are n-element vectors. + virtual bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) = 0; + + // Computes a matrix-vector product using a symmetric packed matrix. + // + // y <- alpha * a * x + beta * y, + // + // alpha and beta are scalars; a is an n-by-n symmetric matrix, supplied in + // packed form; x and y are n-element vectors. + virtual bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &ap, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &ap, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) = 0; + + // Performs a rank-1 update of a symmetric packed matrix. + // + // a <- alpha * x * x' + a, + // + // alpha is a scalar; x is an n-element vector; a is an n-by-n symmetric + // matrix, supplied in packed form. + virtual bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *ap) = 0; + virtual bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *ap) = 0; + + // Performs a rank-2 update of a symmetric packed matrix. + // + // a <- alpha * x * x' + alpha * y * x' + a, + // + // alpha is a scalar; x and y are n-element vectors; a is an n-by-n symmetric + // matrix, supplied in packed form. + virtual bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *ap) = 0; + virtual bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *ap) = 0; + + // Computes a matrix-vector product for a symmetric matrix. + // + // y <- alpha * a * x + beta * y, + // + // alpha and beta are scalars; a is an n-by-n symmetric matrix; x and y are + // n-element vectors. + virtual bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) = 0; + virtual bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) = 0; + + // Performs a rank-1 update of a symmetric matrix. + // + // a <- alpha * x * x' + a, + // + // alpha is a scalar; x is an n-element vector; a is an n-by-n symmetric + // matrix. 
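Before the full-storage Syr declarations that follow, one note on the packed (Sp*, and earlier Hp*) routines above: standard BLAS column-major packed storage is assumed, so with uplo == kUpper, A(i, j) for i <= j is held at ap[i + j * (j + 1) / 2], and with kLower, A(i, j) for i >= j is held at ap[i + j * (2 * n - j - 1) / 2] (0-based indices). A small accessor makes the upper case concrete (illustrative only):

    inline float UpperPackedAt(const float *ap, uint64 i, uint64 j) {
      return ap[i + j * (j + 1) / 2];  // valid for i <= j
    }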
+ virtual bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *a, int lda) = 0; + virtual bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *a, int lda) = 0; + + // Performs a rank-2 update of symmetric matrix. + // + // a <- alpha * x * x' + alpha * y * x' + a, + // + // alpha is a scalar; x and y are n-element vectors; a is an n-by-n symmetric + // matrix. + virtual bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) = 0; + virtual bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) = 0; + + // Computes a matrix-vector product using a triangular band matrix. + // + // x <- a * x, + // or + // x <- a' * x, + // or + // x <- conj(a') * x, + // + // a is an n-by-n unit, or non-unit, upper or lower triangular band matrix, + // with k+1 diagonals; x is a n-element vector. + virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) = 0; + virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) = 0; + virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) = 0; + virtual bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) = 0; + + // Solves a system of linear equations whose coefficients are in a triangular + // band matrix as below: + // + // a * x = b, + // or + // a' * x = b, + // or + // conj(a') * x = b, + // + // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or + // lower triangular band matrix, with k+1 diagonals. + virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) = 0; + virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) = 0; + virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) = 0; + virtual bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) = 0; + + // Computes a matrix-vector product using a triangular packed matrix. 
+ // + // x <- a * x, + // or + // x <- a' * x, + // or + // x <- conj(a') * x, + // + // a is an n-by-n unit, or non-unit, upper or lower triangular matrix, + // supplied in packed form; x is a n-element vector. + virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) = 0; + virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) = 0; + virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) = 0; + virtual bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) = 0; + + // Solves a system of linear equations whose coefficients are in a triangular + // packed matrix as below: + // + // a * x = b, + // or + // a' * x = b, + // or + // conj(a') * x = b, + // + // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or + // lower triangular matrix, supplied in packed form. + virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) = 0; + virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) = 0; + virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) = 0; + virtual bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) = 0; + + // Computes a matrix-vector product using a triangular matrix. + // + // x <- a * x, + // or + // x <- a' * x, + // or + // x <- conj(a') * x, + // + // a is an n-by-n unit, or non-unit, upper or lower triangular matrix; x is a + // n-element vector. 
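For orientation, the kUpper/kNoTranspose/kNonUnit case of this product reduces to the following in-place reference loop (column-major, unit stride); kUnit would treat a(i, i) as 1 without reading it:

    for (uint64 i = 0; i < n; ++i) {
      float acc = 0.0f;
      for (uint64 j = i; j < n; ++j) {
        acc += a[i + j * lda] * x[j];
      }
      // Safe in place: only x[j] with j >= i are read, none overwritten yet.
      x[i] = acc;
    }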
+ virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) = 0; + virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) = 0; + virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) = 0; + virtual bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) = 0; + + // Solves a system of linear equations whose coefficients are in a triangular + // matrix as below: + // + // a * x = b, + // or + // a' * x = b, + // or + // conj(a') * x = b, + // + // b and x are n-element vectors; a is an n-by-n unit, or non-unit, upper or + // lower triangular matrix. + virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) = 0; + virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) = 0; + virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) = 0; + virtual bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) = 0; + + // Computes a matrix-matrix product with general matrices: + // + // c <- alpha * op(a) * op(b) + beta * c, + // + // op(X) is one of op(X) = X, or op(X) = X', or op(X) = conj(X'); alpha and + // beta are scalars; a, b, and c are matrices; op(a) is an m-by-k matrix; + // op(b) is a k-by-n matrix; c is an m-by-n matrix. 
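As the workhorse of the interface, Gemm is worth one concrete sketch. It reuses the Stream-wrapper style from the header comment (ThenBlasGemm is assumed by analogy with ThenBlasAxpy) and column-major layout, so each leading dimension is the allocated row count of its matrix:

    // Sketch: c (m x n) <- 1.0 * a (m x k) * b (k x n) + 0.0 * c.
    stream.ThenBlasGemm(blas::Transpose::kNoTranspose,
                        blas::Transpose::kNoTranspose, m, n, k,
                        1.0f, a, /*lda=*/m, b, /*ldb=*/k,
                        0.0f, &c, /*ldc=*/m);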
+ virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) = 0; + virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) = 0; + virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) = 0; + virtual bool DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) = 0; + + // Computes a batch of matrix-matrix product with general matrices. + // This is a batched version of DoBlasGemm. + // The batched GEMM computes matrix product for each input/output in a, b, + // and c, which contain batch_count DeviceMemory objects. + virtual bool DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, float alpha, + const port::ArraySlice<DeviceMemory<float> *> &a, int lda, + const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, + const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, + int batch_count) = 0; + virtual bool DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, double alpha, + const port::ArraySlice<DeviceMemory<double> *> &a, int lda, + const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, + const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, + int batch_count) = 0; + virtual bool DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, + std::complex<float> beta, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, + int batch_count) = 0; + virtual bool DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<double> alpha, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb, + std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, + int batch_count) = 0; + + // Computes a matrix-matrix product where one input matrix is Hermitian: + // + // c <- alpha * a * b + beta * c, + // or + // c <- alpha * b * a + beta * c, + // + // alpha and beta are scalars; a is a Hermitian matrix; b and c are m-by-n + // matrices. 
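Before moving on to the Hermitian routines, one usage note on DoBlasGemmBatched above: a, b, and c are host-side slices of batch_count device buffers, one per problem instance, and every instance shares the same dimensions and leading dimensions. A hypothetical direct call through a BlasSupport pointer (names and buffers assumed for illustration):

    // Sketch: three independent (m x k) * (k x n) products in one call.
    std::vector<se::DeviceMemory<float> *> as = {&a0, &a1, &a2};
    std::vector<se::DeviceMemory<float> *> bs = {&b0, &b1, &b2};
    std::vector<se::DeviceMemory<float> *> cs = {&c0, &c1, &c2};
    blas->DoBlasGemmBatched(&stream, blas::Transpose::kNoTranspose,
                            blas::Transpose::kNoTranspose, m, n, k, 1.0f,
                            as, /*lda=*/m, bs, /*ldb=*/k, 0.0f,
                            cs, /*ldc=*/m, /*batch_count=*/3);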
+ virtual bool DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) = 0; + virtual bool DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) = 0; + + // Performs a Hermitian rank-k update. + // + // c <- alpha * a * conj(a') + beta * c, + // or + // c <- alpha * conj(a') * a + beta * c, + // + // alpha and beta are scalars; c is a n-by-n Hermitian matrix; a is an n-by-k + // matrix in the first case and a k-by-n matrix in the second case. + virtual bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) = 0; + virtual bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) = 0; + + // Performs a Hermitian rank-2k update. + // + // c <- alpha * a * conj(b') + conj(alpha) * b * conj(a') + beta * c, + // or + // c <- alpha * conj(b') * a + conj(alpha) * conj(a') * b + beta * c, + // + // alpha and beta are scalars; c is a n-by-n Hermitian matrix; a and b are + // n-by-k matrices in the first case and k-by-n matrices in the second case. + virtual bool DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) = 0; + virtual bool DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) = 0; + + // Computes a matrix-matrix product where one input matrix is symmetric. + // + // c <- alpha * a * b + beta * c, + // or + // c <- alpha * b * a + beta * c, + // + // alpha and beta are scalars; a is a symmetric matrix; b and c are m-by-n + // matrices. 
+ virtual bool DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) = 0; + virtual bool DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) = 0; + virtual bool DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) = 0; + virtual bool DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) = 0; + + // Performs a symmetric rank-k update. + // + // c <- alpha * a * a' + beta * c, + // or + // c <- alpha * a' * a + beta * c, + // + // alpha and beta are scalars; c is a n-by-n symmetric matrix; a is an n-by-k + // matrix in the first case and a k-by-n matrix in the second case. + virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + float beta, DeviceMemory<float> *c, int ldc) = 0; + virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + double beta, DeviceMemory<double> *c, int ldc) = 0; + virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) = 0; + virtual bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) = 0; + + // Performs a symmetric rank-2k update. + // + // c <- alpha * a * b' + alpha * b * a' + beta * c, + // or + // c <- alpha * b' * a + alpha * a' * b + beta * c, + // + // alpha and beta are scalars; c is a n-by-n symmetric matrix; a and b are + // n-by-k matrices in the first case and k-by-n matrices in the second case. 
+ virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) = 0; + virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) = 0; + virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) = 0; + virtual bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) = 0; + + // Computes a matrix-matrix product where one input matrix is triangular. + // + // b <- alpha * op(a) * b, + // or + // b <- alpha * b * op(a) + // + // alpha is a scalar; b is an m-by-n matrix; a is a unit, or non-unit, upper + // or lower triangular matrix; op(a) is one of op(a) = a, or op(a) = a', or + // op(a) = conj(a'). + virtual bool DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *b, int ldb) = 0; + virtual bool DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) = 0; + virtual bool DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) = 0; + virtual bool DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) = 0; + + // Solves a triangular matrix equation. + // + // op(a) * x = alpha * b, + // or + // x * op(a) = alpha * b + // + // alpha is a scalar; x and b are m-by-n matrices; a is a unit, or non-unit, + // upper or lower triangular matrix; op(a) is one of op(a) = a, or op(a) = a', + // or op(a) = conj(a'). 
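Unlike Trmm just above, Trsm below solves a triangular system rather than forming a product. A typical call solves L * X = B in place, overwriting b with X (a direct BlasSupport call sketched here; names and buffers are assumed for illustration):

    blas->DoBlasTrsm(&stream, blas::Side::kLeft, blas::UpperLower::kLower,
                     blas::Transpose::kNoTranspose, blas::Diagonal::kNonUnit,
                     m, n, /*alpha=*/1.0f, a, /*lda=*/m, &b, /*ldb=*/m);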
+ virtual bool DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *b, int ldb) = 0; + virtual bool DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) = 0; + virtual bool DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) = 0; + virtual bool DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) = 0; + + protected: + BlasSupport() {} + + private: + SE_DISALLOW_COPY_AND_ASSIGN(BlasSupport); +}; + +// Macro used to quickly declare overrides for abstract virtuals in the +// BlasSupport base class. +#define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \ + bool DoBlasAsum(Stream *stream, uint64 elem_count, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<float> *result) override; \ + bool DoBlasAsum(Stream *stream, uint64 elem_count, \ + const DeviceMemory<double> &x, int incx, \ + DeviceMemory<double> *result) override; \ + bool DoBlasAsum(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<float> *result) override; \ + bool DoBlasAsum(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + DeviceMemory<double> *result) override; \ + bool DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<float> *y, int incy) override; \ + bool DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, \ + const DeviceMemory<double> &x, int incx, \ + DeviceMemory<double> *y, int incy) override; \ + bool DoBlasAxpy(Stream *stream, uint64 elem_count, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasAxpy(Stream *stream, uint64 elem_count, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasCopy(Stream *stream, uint64 elem_count, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<float> *y, int incy) override; \ + bool DoBlasCopy(Stream *stream, uint64 elem_count, \ + const DeviceMemory<double> &x, int incx, \ + DeviceMemory<double> *y, int incy) override; \ + bool DoBlasCopy(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasCopy(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasDot(Stream *stream, uint64 elem_count, \ + const DeviceMemory<float> &x, int incx, \ + const DeviceMemory<float> &y, int incy, \ + DeviceMemory<float> *result) override; \ + bool DoBlasDot(Stream *stream, uint64 
elem_count, \ + const DeviceMemory<double> &x, int incx, \ + const DeviceMemory<double> &y, int incy, \ + DeviceMemory<double> *result) override; \ + bool DoBlasDotc(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + const DeviceMemory<std::complex<float>> &y, int incy, \ + DeviceMemory<std::complex<float>> *result) override; \ + bool DoBlasDotc(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + const DeviceMemory<std::complex<double>> &y, int incy, \ + DeviceMemory<std::complex<double>> *result) override; \ + bool DoBlasDotu(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + const DeviceMemory<std::complex<float>> &y, int incy, \ + DeviceMemory<std::complex<float>> *result) override; \ + bool DoBlasDotu(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + const DeviceMemory<std::complex<double>> &y, int incy, \ + DeviceMemory<std::complex<double>> *result) override; \ + bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<float> *result) override; \ + bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ + const DeviceMemory<double> &x, int incx, \ + DeviceMemory<double> *result) override; \ + bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<float> *result) override; \ + bool DoBlasNrm2(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + DeviceMemory<double> *result) override; \ + bool DoBlasRot(Stream *stream, uint64 elem_count, DeviceMemory<float> *x, \ + int incx, DeviceMemory<float> *y, int incy, float c, float s) \ + override; \ + bool DoBlasRot(Stream *stream, uint64 elem_count, DeviceMemory<double> *x, \ + int incx, DeviceMemory<double> *y, int incy, double c, \ + double s) override; \ + bool DoBlasRot(Stream *stream, uint64 elem_count, \ + DeviceMemory<std::complex<float>> *x, int incx, \ + DeviceMemory<std::complex<float>> *y, int incy, float c, \ + float s) override; \ + bool DoBlasRot(Stream *stream, uint64 elem_count, \ + DeviceMemory<std::complex<double>> *x, int incx, \ + DeviceMemory<std::complex<double>> *y, int incy, double c, \ + double s) override; \ + bool DoBlasRotg(Stream *stream, DeviceMemory<float> *a, \ + DeviceMemory<float> *b, DeviceMemory<float> *c, \ + DeviceMemory<float> *s) override; \ + bool DoBlasRotg(Stream *stream, DeviceMemory<double> *a, \ + DeviceMemory<double> *b, DeviceMemory<double> *c, \ + DeviceMemory<double> *s) override; \ + bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a, \ + DeviceMemory<std::complex<float>> *b, \ + DeviceMemory<float> *c, \ + DeviceMemory<std::complex<float>> *s) override; \ + bool DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a, \ + DeviceMemory<std::complex<double>> *b, \ + DeviceMemory<double> *c, \ + DeviceMemory<std::complex<double>> *s) override; \ + bool DoBlasRotm(Stream *stream, uint64 elem_count, DeviceMemory<float> *x, \ + int incx, DeviceMemory<float> *y, int incy, \ + const DeviceMemory<float> ¶m) override; \ + bool DoBlasRotm(Stream *stream, uint64 elem_count, DeviceMemory<double> *x, \ + int incx, DeviceMemory<double> *y, int incy, \ + const DeviceMemory<double> ¶m) override; \ + bool DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1, \ + DeviceMemory<float> *d2, DeviceMemory<float> *x1, \ + const 
DeviceMemory<float> &y1, DeviceMemory<float> *param) \ + override; \ + bool DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1, \ + DeviceMemory<double> *d2, DeviceMemory<double> *x1, \ + const DeviceMemory<double> &y1, \ + DeviceMemory<double> *param) override; \ + bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, \ + DeviceMemory<float> *x, int incx) override; \ + bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, \ + DeviceMemory<double> *x, int incx) override; \ + bool DoBlasScal(Stream *stream, uint64 elem_count, float alpha, \ + DeviceMemory<std::complex<float>> *x, int incx) override; \ + bool DoBlasScal(Stream *stream, uint64 elem_count, double alpha, \ + DeviceMemory<std::complex<double>> *x, int incx) override; \ + bool DoBlasScal(Stream *stream, uint64 elem_count, \ + std::complex<float> alpha, \ + DeviceMemory<std::complex<float>> *x, int incx) override; \ + bool DoBlasScal(Stream *stream, uint64 elem_count, \ + std::complex<double> alpha, \ + DeviceMemory<std::complex<double>> *x, int incx) override; \ + bool DoBlasSwap(Stream *stream, uint64 elem_count, DeviceMemory<float> *x, \ + int incx, DeviceMemory<float> *y, int incy) override; \ + bool DoBlasSwap(Stream *stream, uint64 elem_count, DeviceMemory<double> *x, \ + int incx, DeviceMemory<double> *y, int incy) override; \ + bool DoBlasSwap(Stream *stream, uint64 elem_count, \ + DeviceMemory<std::complex<float>> *x, int incx, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasSwap(Stream *stream, uint64 elem_count, \ + DeviceMemory<std::complex<double>> *x, int incx, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasIamax(Stream *stream, uint64 elem_count, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasIamax(Stream *stream, uint64 elem_count, \ + const DeviceMemory<double> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasIamax(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasIamax(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasIamin(Stream *stream, uint64 elem_count, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasIamin(Stream *stream, uint64 elem_count, \ + const DeviceMemory<double> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasIamin(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasIamin(Stream *stream, uint64 elem_count, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + DeviceMemory<int> *result) override; \ + bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + uint64 kl, uint64 ku, float alpha, \ + const DeviceMemory<float> &a, int lda, \ + const DeviceMemory<float> &x, int incx, float beta, \ + DeviceMemory<float> *y, int incy) override; \ + bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + uint64 kl, uint64 ku, double alpha, \ + const DeviceMemory<double> &a, int lda, \ + const DeviceMemory<double> &x, int incx, double beta, \ + DeviceMemory<double> *y, int incy) override; \ + bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + uint64 kl, uint64 ku, std::complex<float> alpha, \ + const 
DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + uint64 kl, uint64 ku, std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + float alpha, const DeviceMemory<float> &a, int lda, \ + const DeviceMemory<float> &x, int incx, float beta, \ + DeviceMemory<float> *y, int incy) override; \ + bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + double alpha, const DeviceMemory<double> &a, int lda, \ + const DeviceMemory<double> &x, int incx, double beta, \ + DeviceMemory<double> *y, int incy) override; \ + bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, \ + const DeviceMemory<float> &x, int incx, \ + const DeviceMemory<float> &y, int incy, \ + DeviceMemory<float> *a, int lda) override; \ + bool DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, \ + const DeviceMemory<double> &x, int incx, \ + const DeviceMemory<double> &y, int incy, \ + DeviceMemory<double> *a, int lda) override; \ + bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + const DeviceMemory<std::complex<float>> &y, int incy, \ + DeviceMemory<std::complex<float>> *a, int lda) override; \ + bool DoBlasGerc(Stream *stream, uint64 m, uint64 n, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + const DeviceMemory<std::complex<double>> &y, int incy, \ + DeviceMemory<std::complex<double>> *a, int lda) override; \ + bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + const DeviceMemory<std::complex<float>> &y, int incy, \ + DeviceMemory<std::complex<float>> *a, int lda) override; \ + bool DoBlasGeru(Stream *stream, uint64 m, uint64 n, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + const DeviceMemory<std::complex<double>> &y, int incy, \ + DeviceMemory<std::complex<double>> *a, int lda) override; \ + bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ + 
std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<std::complex<float>> *a, int lda) override; \ + bool DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<std::complex<double>> &x, \ + int incx, DeviceMemory<std::complex<double>> *a, int lda) \ + override; \ + bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + const DeviceMemory<std::complex<float>> &y, int incy, \ + DeviceMemory<std::complex<float>> *a, int lda) override; \ + bool DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + const DeviceMemory<std::complex<double>> &y, int incy, \ + DeviceMemory<std::complex<double>> *a, int lda) override; \ + bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &ap, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *y, int incy) override; \ + bool DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &ap, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *y, int incy) override; \ + bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + DeviceMemory<std::complex<float>> *ap) override; \ + bool DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<std::complex<double>> &x, \ + int incx, DeviceMemory<std::complex<double>> *ap) override; \ + bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &x, int incx, \ + const DeviceMemory<std::complex<float>> &y, int incy, \ + DeviceMemory<std::complex<float>> *ap) override; \ + bool DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &x, int incx, \ + const DeviceMemory<std::complex<double>> &y, int incy, \ + DeviceMemory<std::complex<double>> *ap) override; \ + bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ + float alpha, const DeviceMemory<float> &a, int lda, \ + const DeviceMemory<float> &x, int incx, float beta, \ + 
DeviceMemory<float> *y, int incy) override; \ + bool DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, uint64 k, \ + double alpha, const DeviceMemory<double> &a, int lda, \ + const DeviceMemory<double> &x, int incx, double beta, \ + DeviceMemory<double> *y, int incy) override; \ + bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + float alpha, const DeviceMemory<float> &ap, \ + const DeviceMemory<float> &x, int incx, float beta, \ + DeviceMemory<float> *y, int incy) override; \ + bool DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<double> &ap, \ + const DeviceMemory<double> &x, int incx, double beta, \ + DeviceMemory<double> *y, int incy) override; \ + bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<float> *ap) override; \ + bool DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<double> &x, int incx, \ + DeviceMemory<double> *ap) override; \ + bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + float alpha, const DeviceMemory<float> &x, int incx, \ + const DeviceMemory<float> &y, int incy, \ + DeviceMemory<float> *ap) override; \ + bool DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<double> &x, int incx, \ + const DeviceMemory<double> &y, int incy, \ + DeviceMemory<double> *ap) override; \ + bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + float alpha, const DeviceMemory<float> &a, int lda, \ + const DeviceMemory<float> &x, int incx, float beta, \ + DeviceMemory<float> *y, int incy) override; \ + bool DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<double> &a, int lda, \ + const DeviceMemory<double> &x, int incx, double beta, \ + DeviceMemory<double> *y, int incy) override; \ + bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, float alpha, \ + const DeviceMemory<float> &x, int incx, \ + DeviceMemory<float> *a, int lda) override; \ + bool DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<double> &x, int incx, \ + DeviceMemory<double> *a, int lda) override; \ + bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + float alpha, const DeviceMemory<float> &x, int incx, \ + const DeviceMemory<float> &y, int incy, \ + DeviceMemory<float> *a, int lda) override; \ + bool DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, \ + double alpha, const DeviceMemory<double> &x, int incx, \ + const DeviceMemory<double> &y, int incy, \ + DeviceMemory<double> *a, int lda) override; \ + bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, const DeviceMemory<float> &a, int lda, \ + DeviceMemory<float> *x, int incx) override; \ + bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, const DeviceMemory<double> &a, int lda, \ + DeviceMemory<double> *x, int incx) override; \ + bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, const DeviceMemory<std::complex<float>> &a, \ + int lda, DeviceMemory<std::complex<float>> *x, int incx) \ + override; \ + bool DoBlasTbmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, 
const DeviceMemory<std::complex<double>> &a, \ + int lda, DeviceMemory<std::complex<double>> *x, int incx) \ + override; \ + bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, const DeviceMemory<float> &a, int lda, \ + DeviceMemory<float> *x, int incx) override; \ + bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, const DeviceMemory<double> &a, int lda, \ + DeviceMemory<double> *x, int incx) override; \ + bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, const DeviceMemory<std::complex<float>> &a, \ + int lda, DeviceMemory<std::complex<float>> *x, int incx) \ + override; \ + bool DoBlasTbsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + uint64 k, const DeviceMemory<std::complex<double>> &a, \ + int lda, DeviceMemory<std::complex<double>> *x, int incx) \ + override; \ + bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<float> &ap, DeviceMemory<float> *x, \ + int incx) override; \ + bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<double> &ap, DeviceMemory<double> *x, \ + int incx) override; \ + bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<std::complex<float>> &ap, \ + DeviceMemory<std::complex<float>> *x, int incx) override; \ + bool DoBlasTpmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<std::complex<double>> &ap, \ + DeviceMemory<std::complex<double>> *x, int incx) override; \ + bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<float> &ap, DeviceMemory<float> *x, \ + int incx) override; \ + bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<double> &ap, DeviceMemory<double> *x, \ + int incx) override; \ + bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<std::complex<float>> &ap, \ + DeviceMemory<std::complex<float>> *x, int incx) override; \ + bool DoBlasTpsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<std::complex<double>> &ap, \ + DeviceMemory<std::complex<double>> *x, int incx) override; \ + bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<float> &a, int lda, \ + DeviceMemory<float> *x, int incx) override; \ + bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<double> &a, int lda, \ + DeviceMemory<double> *x, int incx) override; \ + bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + DeviceMemory<std::complex<float>> *x, int incx) override; \ + bool DoBlasTrmv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ 
+ const DeviceMemory<std::complex<double>> &a, int lda, \ + DeviceMemory<std::complex<double>> *x, int incx) override; \ + bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<float> &a, int lda, \ + DeviceMemory<float> *x, int incx) override; \ + bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<double> &a, int lda, \ + DeviceMemory<double> *x, int incx) override; \ + bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + DeviceMemory<std::complex<float>> *x, int incx) override; \ + bool DoBlasTrsv(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, blas::Diagonal diag, uint64 n, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + DeviceMemory<std::complex<double>> *x, int incx) override; \ + bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ + blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ + float alpha, const DeviceMemory<float> &a, int lda, \ + const DeviceMemory<float> &b, int ldb, float beta, \ + DeviceMemory<float> *c, int ldc) override; \ + bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ + blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ + double alpha, const DeviceMemory<double> &a, int lda, \ + const DeviceMemory<double> &b, int ldb, double beta, \ + DeviceMemory<double> *c, int ldc) override; \ + bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ + blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &b, int ldb, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *c, int ldc) override; \ + bool DoBlasGemm(Stream *stream, blas::Transpose transa, \ + blas::Transpose transb, uint64 m, uint64 n, uint64 k, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &b, int ldb, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *c, int ldc) override; \ + bool DoBlasGemmBatched( \ + Stream *stream, blas::Transpose transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, float alpha, \ + const port::ArraySlice<DeviceMemory<float> *> &a, int lda, \ + const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, \ + const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, \ + int batch_count) override; \ + bool DoBlasGemmBatched( \ + Stream *stream, blas::Transpose transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, double alpha, \ + const port::ArraySlice<DeviceMemory<double> *> &a, int lda, \ + const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, \ + const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, \ + int batch_count) override; \ + bool DoBlasGemmBatched( \ + Stream *stream, blas::Transpose transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \ + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, \ + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, \ + std::complex<float> beta, \ + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, \ + int batch_count) override; \ + bool DoBlasGemmBatched( \ + Stream *stream, blas::Transpose 
transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \ + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, \ + int lda, \ + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, \ + int ldb, std::complex<double> beta, \ + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, \ + int ldc, int batch_count) override; \ + bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + uint64 m, uint64 n, std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &b, int ldb, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *c, int ldc) override; \ + bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + uint64 m, uint64 n, std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &b, int ldb, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *c, int ldc) override; \ + bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, float alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + float beta, DeviceMemory<std::complex<float>> *c, int ldc) \ + override; \ + bool DoBlasHerk(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, double alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + double beta, DeviceMemory<std::complex<double>> *c, int ldc) \ + override; \ + bool DoBlasHer2k( \ + Stream *stream, blas::UpperLower uplo, blas::Transpose trans, uint64 n, \ + uint64 k, std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &b, int ldb, float beta, \ + DeviceMemory<std::complex<float>> *c, int ldc) override; \ + bool DoBlasHer2k( \ + Stream *stream, blas::UpperLower uplo, blas::Transpose trans, uint64 n, \ + uint64 k, std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &b, int ldb, double beta, \ + DeviceMemory<std::complex<double>> *c, int ldc) override; \ + bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + uint64 m, uint64 n, float alpha, \ + const DeviceMemory<float> &a, int lda, \ + const DeviceMemory<float> &b, int ldb, float beta, \ + DeviceMemory<float> *c, int ldc) override; \ + bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + uint64 m, uint64 n, double alpha, \ + const DeviceMemory<double> &a, int lda, \ + const DeviceMemory<double> &b, int ldb, double beta, \ + DeviceMemory<double> *c, int ldc) override; \ + bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + uint64 m, uint64 n, std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &b, int ldb, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *c, int ldc) override; \ + bool DoBlasSymm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + uint64 m, uint64 n, std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &b, int ldb, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *c, int ldc) override; \ + bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, float alpha, \ + const 
DeviceMemory<float> &a, int lda, float beta, \ + DeviceMemory<float> *c, int ldc) override; \ + bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, double alpha, \ + const DeviceMemory<double> &a, int lda, double beta, \ + DeviceMemory<double> *c, int ldc) override; \ + bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *c, int ldc) override; \ + bool DoBlasSyrk(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *c, int ldc) override; \ + bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, float alpha, \ + const DeviceMemory<float> &a, int lda, \ + const DeviceMemory<float> &b, int ldb, float beta, \ + DeviceMemory<float> *c, int ldc) override; \ + bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, double alpha, \ + const DeviceMemory<double> &a, int lda, \ + const DeviceMemory<double> &b, int ldb, double beta, \ + DeviceMemory<double> *c, int ldc) override; \ + bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, \ + std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + const DeviceMemory<std::complex<float>> &b, int ldb, \ + std::complex<float> beta, \ + DeviceMemory<std::complex<float>> *c, int ldc) override; \ + bool DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, \ + blas::Transpose trans, uint64 n, uint64 k, \ + std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + const DeviceMemory<std::complex<double>> &b, int ldb, \ + std::complex<double> beta, \ + DeviceMemory<std::complex<double>> *c, int ldc) override; \ + bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose transa, blas::Diagonal diag, uint64 m, \ + uint64 n, float alpha, const DeviceMemory<float> &a, \ + int lda, DeviceMemory<float> *b, int ldb) override; \ + bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose transa, blas::Diagonal diag, uint64 m, \ + uint64 n, double alpha, const DeviceMemory<double> &a, \ + int lda, DeviceMemory<double> *b, int ldb) override; \ + bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose transa, blas::Diagonal diag, uint64 m, \ + uint64 n, std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + DeviceMemory<std::complex<float>> *b, int ldb) override; \ + bool DoBlasTrmm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose transa, blas::Diagonal diag, uint64 m, \ + uint64 n, std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + DeviceMemory<std::complex<double>> *b, int ldb) override; \ + bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose transa, blas::Diagonal diag, uint64 m, \ + uint64 n, float alpha, const DeviceMemory<float> &a, \ + int lda, DeviceMemory<float> *b, int ldb) override; \ + bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose 
transa, blas::Diagonal diag, uint64 m, \ + uint64 n, double alpha, const DeviceMemory<double> &a, \ + int lda, DeviceMemory<double> *b, int ldb) override; \ + bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose transa, blas::Diagonal diag, uint64 m, \ + uint64 n, std::complex<float> alpha, \ + const DeviceMemory<std::complex<float>> &a, int lda, \ + DeviceMemory<std::complex<float>> *b, int ldb) override; \ + bool DoBlasTrsm(Stream *stream, blas::Side side, blas::UpperLower uplo, \ + blas::Transpose transa, blas::Diagonal diag, uint64 m, \ + uint64 n, std::complex<double> alpha, \ + const DeviceMemory<std::complex<double>> &a, int lda, \ + DeviceMemory<std::complex<double>> *b, int ldb) override; + +} // namespace blas +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_BLAS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/cuda/cuda_activation.cc new file mode 100644 index 0000000000..32d2c0d424 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_activation.cc @@ -0,0 +1,30 @@ +#include "tensorflow/stream_executor/cuda/cuda_activation.h" + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +CUcontext ExtractCudaContext(CUDAExecutor *cuda_exec); +CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec); + +ScopedActivateExecutorContext::ScopedActivateExecutorContext( + CUDAExecutor *cuda_exec, MultiOpActivation moa) + : cuda_exec_(cuda_exec), + driver_scoped_activate_context_( + new ScopedActivateContext{ExtractCudaContext(cuda_exec), moa}) {} + +ScopedActivateExecutorContext::ScopedActivateExecutorContext( + StreamExecutor *stream_exec, MultiOpActivation moa) + : ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec), moa) {} + +ScopedActivateExecutorContext::~ScopedActivateExecutorContext() { + delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_); +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h new file mode 100644 index 0000000000..4181d13d0a --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_activation.h @@ -0,0 +1,53 @@ +// This file contains APIs that assume a StreamExecutor is backed by CUDA. +// It reaches into the CUDA implementation to activate an underlying CUDA +// context. +// +// Having this file separate from cuda_gpu_executor.h means that dependent +// code does not also have to depend on cuda.h. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_ + +#include "tensorflow/stream_executor/cuda/multi_op_activation.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +class StreamExecutor; + +namespace cuda { + +class CUDAExecutor; +class ScopedActivateContext; + +// Activates a CUDA context within an enclosing scope. +class ScopedActivateExecutorContext { + public: + // Form that takes a CUDA executor implementation. 
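Both constructor forms declared below behave as an RAII guard: construction activates the wrapped executor's CUDA context on the calling thread, and the destructor (declared further down) undoes the activation. A minimal usage sketch, assuming only this header plus a StreamExecutor that is already backed by the CUDA platform:

    #include "tensorflow/stream_executor/cuda/cuda_activation.h"

    namespace gputools = perftools::gputools;

    // Runs driver-level work with the executor's CUDA context current.
    void DoDriverWork(gputools::StreamExecutor *stream_exec) {
      gputools::cuda::ScopedActivateExecutorContext activation{stream_exec};
      // ... CUDA driver calls that need the context to be current ...
    }  // The destructor restores the prior activation state here.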
+ explicit ScopedActivateExecutorContext( + CUDAExecutor* cuda_exec, MultiOpActivation moa = MultiOpActivation::kNo); + + // Form that takes a pImpl executor and extracts a CUDA implementation -- + // fatal failure if it is not CUDA inside. + explicit ScopedActivateExecutorContext( + StreamExecutor* stream_exec, + MultiOpActivation moa = MultiOpActivation::kNo); + + ~ScopedActivateExecutorContext(); + + private: + // The CUDA executor implementation whose context is activated. + CUDAExecutor* cuda_exec_; + + // The cuda.h-using datatype that we wrap. + ScopedActivateContext* driver_scoped_activate_context_; + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc new file mode 100644 index 0000000000..ef1036bca3 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -0,0 +1,2184 @@ +#include "tensorflow/stream_executor/cuda/cuda_blas.h" + +#include <dlfcn.h> + +#include <complex> + +#include "tensorflow/stream_executor/cuda/cuda_activation.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/cuda/cuda_helpers.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/status_macros.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "third_party/gpus/cuda/include/cublas_v2.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin); + +namespace dynload { + +#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char *kName; \ + using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ + static void *GetDsoHandle() { \ + static auto status = internal::CachedDsoLoader::GetCublasDsoHandle(); \ + return status.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void *f = dlsym(GetDsoHandle(), kName); \ + CHECK(f != nullptr) << "could not find " << kName \ + << " in cuBLAS DSO; dlerror: " << dlerror(); \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + cublasStatus_t operator()(CUDAExecutor * parent, Args... 
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \ + PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSnrm2) \ + __macro(cublasDnrm2) \ + __macro(cublasScnrm2) \ + __macro(cublasDznrm2) \ + __macro(cublasSdot) \ + __macro(cublasDdot) \ + __macro(cublasCdotu) \ + __macro(cublasCdotc) \ + __macro(cublasZdotu) \ + __macro(cublasZdotc) \ + __macro(cublasSscal) \ + __macro(cublasDscal) \ + __macro(cublasCscal) \ + __macro(cublasCsscal) \ + __macro(cublasZscal) \ + __macro(cublasZdscal) \ + __macro(cublasSaxpy) \ + __macro(cublasDaxpy) \ + __macro(cublasCaxpy) \ + __macro(cublasZaxpy) \ + __macro(cublasScopy) \ + __macro(cublasDcopy) \ + __macro(cublasCcopy) \ + __macro(cublasZcopy) \ + __macro(cublasSswap) \ + __macro(cublasDswap) \ + __macro(cublasCswap) \ + __macro(cublasZswap) \ + __macro(cublasIsamax) \ + __macro(cublasIdamax) \ + __macro(cublasIcamax) \ + __macro(cublasIzamax) \ + __macro(cublasIsamin) \ + __macro(cublasIdamin) \ + __macro(cublasIcamin) \ + __macro(cublasIzamin) \ + __macro(cublasSasum) \ + __macro(cublasDasum) \ + __macro(cublasScasum) \ + __macro(cublasDzasum) \ + __macro(cublasSrot) \ + __macro(cublasDrot) \ + __macro(cublasCrot) \ + __macro(cublasCsrot) \ + __macro(cublasZrot) \ + __macro(cublasZdrot) \ + __macro(cublasSrotg) \ + __macro(cublasDrotg) \ + __macro(cublasCrotg) \ + __macro(cublasZrotg) \ + __macro(cublasSrotm) \ + __macro(cublasDrotm) \ + __macro(cublasSrotmg) \ + __macro(cublasDrotmg) \ + __macro(cublasSgemv) \ + __macro(cublasDgemv) \ + __macro(cublasCgemv) \ + __macro(cublasZgemv) \ + __macro(cublasSgbmv) \ + __macro(cublasDgbmv) \ + __macro(cublasCgbmv) \ + __macro(cublasZgbmv) \ + __macro(cublasStrmv) \ + __macro(cublasDtrmv) \ + __macro(cublasCtrmv) \ + __macro(cublasZtrmv) \ + __macro(cublasStbmv) \ + __macro(cublasDtbmv) \ + __macro(cublasCtbmv) \ + __macro(cublasZtbmv) \ + __macro(cublasStpmv) \ + __macro(cublasDtpmv) \ + __macro(cublasCtpmv) \ + __macro(cublasZtpmv) \ + __macro(cublasStrsv) \ + __macro(cublasDtrsv) \ + __macro(cublasCtrsv) \ + __macro(cublasZtrsv) \ + __macro(cublasStpsv) \ + __macro(cublasDtpsv) \ + __macro(cublasCtpsv) \ + __macro(cublasZtpsv) \ + __macro(cublasStbsv) \ + __macro(cublasDtbsv) \ + __macro(cublasCtbsv) \ + __macro(cublasZtbsv) \ + __macro(cublasSsymv) \ + __macro(cublasDsymv) \ + __macro(cublasCsymv) \ + __macro(cublasZsymv) \ + __macro(cublasChemv) \ + __macro(cublasZhemv) \ + __macro(cublasSsbmv) \ + __macro(cublasDsbmv) \ + __macro(cublasChbmv) \ + __macro(cublasZhbmv) \ + __macro(cublasSspmv) \ + __macro(cublasDspmv) \ + __macro(cublasChpmv) \ + __macro(cublasZhpmv) \ + __macro(cublasSger) \ + __macro(cublasDger) \ + __macro(cublasCgeru) \ + __macro(cublasCgerc) \ + __macro(cublasZgeru) \ + __macro(cublasZgerc) \ + __macro(cublasSsyr) \ + __macro(cublasDsyr) \ + __macro(cublasCsyr) \ + __macro(cublasZsyr) \ + __macro(cublasCher) \ + __macro(cublasZher) \ + __macro(cublasSspr) \ + __macro(cublasDspr) \ + __macro(cublasChpr) \ + __macro(cublasZhpr) \ + __macro(cublasSsyr2) \ + __macro(cublasDsyr2) \ + __macro(cublasCsyr2) \ + __macro(cublasZsyr2) \ + __macro(cublasCher2) \ + __macro(cublasZher2) \ + __macro(cublasSspr2) \ + __macro(cublasDspr2) \ + __macro(cublasChpr2) \ + __macro(cublasZhpr2) \ + __macro(cublasSgemm) \ + __macro(cublasDgemm) \ + __macro(cublasCgemm) \ + 
__macro(cublasZgemm) \ + __macro(cublasSsyrk) \ + __macro(cublasDsyrk) \ + __macro(cublasCsyrk) \ + __macro(cublasZsyrk) \ + __macro(cublasCherk) \ + __macro(cublasZherk) \ + __macro(cublasSsyr2k) \ + __macro(cublasDsyr2k) \ + __macro(cublasCsyr2k) \ + __macro(cublasZsyr2k) \ + __macro(cublasCher2k) \ + __macro(cublasZher2k) \ + __macro(cublasSsyrkx) \ + __macro(cublasDsyrkx) \ + __macro(cublasCsyrkx) \ + __macro(cublasZsyrkx) \ + __macro(cublasCherkx) \ + __macro(cublasZherkx) \ + __macro(cublasSsymm) \ + __macro(cublasDsymm) \ + __macro(cublasCsymm) \ + __macro(cublasZsymm) \ + __macro(cublasChemm) \ + __macro(cublasZhemm) \ + __macro(cublasStrsm) \ + __macro(cublasDtrsm) \ + __macro(cublasCtrsm) \ + __macro(cublasZtrsm) \ + __macro(cublasStrmm) \ + __macro(cublasDtrmm) \ + __macro(cublasCtrmm) \ + __macro(cublasZtrmm) \ + __macro(cublasSgeam) \ + __macro(cublasDgeam) \ + __macro(cublasCgeam) \ + __macro(cublasZgeam) \ + __macro(cublasSdgmm) \ + __macro(cublasDdgmm) \ + __macro(cublasCdgmm) \ + __macro(cublasZdgmm) + +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched) +PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched) +CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP) + +} // namespace dynload + +static string ToString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + default: + return port::StrCat("<invalid cublas status: ", status, ">"); + } +} + +// cuBLAS has interfaces that permit pointers to be passed from either the host +// memory space or the device memory space; however, you must instruct it as to +// which address space those pointers are in with cublasSetPointerMode. +// +// This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS call +// you are about to perform in a given scope. +// +// The prior cuBLAS pointer mode is retained and restored when this object goes +// out of scope. +class ScopedCublasPointerMode { + public: + // Note that, because the setting of the cublas pointer mode is fallible, + // construction of this scoped datatype must be paired with a call to + // Init(). + // + // Parameters: + // handle: The cublas library handle to act upon in setting the pointer mode. + explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle) + : parent_(parent), handle_(handle), ok_(false) {} + + // Attempts the switch to the requested scoped pointer mode, new_mode. + // + // Note that when false is returned, an appropriate error has already been + // logged. 
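For reference, the underlying cuBLAS protocol that the Init() routine below and the destructor wrap is a simple get/set pair on the library handle. A minimal sketch against the plain cublas_v2 API (status checking elided, and without the DSO-loading shims used elsewhere in this file), assuming a handle already created via cublasCreate:

    #include "third_party/gpus/cuda/include/cublas_v2.h"

    // Directs the next calls' scalar arguments to host memory and returns
    // the previously active mode so the caller can restore it later.
    cublasPointerMode_t UseHostPointerMode(cublasHandle_t handle) {
      cublasPointerMode_t old_mode;
      cublasGetPointerMode(handle, &old_mode);  // remember the prior mode
      cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
      return old_mode;  // hand back to cublasSetPointerMode to restore
    }

Performing the restore in a destructor, as ScopedCublasPointerMode does, keeps the handle's mode consistent even on early returns.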
+ bool Init(cublasPointerMode_t new_mode) { + cublasStatus_t ret = + dynload::cublasGetPointerMode_v2(parent_, handle_, &old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + ret = dynload::cublasSetPointerMode_v2(parent_, handle_, new_mode); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret); + return ok_ = false; + } + + return ok_ = true; + } + + // Switches back to the prior pointer mode, if the switch operation was + // successful in the first place. + ~ScopedCublasPointerMode() { + if (ok_) { + cublasStatus_t ret = + dynload::cublasSetPointerMode_v2(parent_, handle_, old_mode_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set former cublas pointer mode: " + << ToString(ret); + } + } + } + + private: + CUDAExecutor *parent_; // Executor for which this pointer mode is established. + cublasHandle_t handle_; // Handle to the cuBLAS instance of interest. + cublasPointerMode_t old_mode_; // Prior cuBLAS pointer mode, to be restored. + bool ok_; // Whether the change was successful. +}; + +bool CUDABlas::Init() { + cublasStatus_t ret = dynload::cublasCreate_v2(parent_, &blas_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create cublas handle: " << ToString(ret); + return false; + } + + return true; +} + +CUDABlas::CUDABlas(cuda::CUDAExecutor *parent) + : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {} + +CUDABlas::~CUDABlas() { + if (blas_ != nullptr) { + dynload::cublasDestroy_v2(parent_, blas_); + } +} + +bool CUDABlas::SetStream(Stream *stream) { + CHECK(stream != nullptr); + CHECK(AsCUDAStreamValue(stream) != nullptr); + CHECK(blas_ != nullptr); + cublasStatus_t ret = + dynload::cublasSetStream_v2(parent_, blas_, AsCUDAStreamValue(stream)); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret); + return false; + } + + return true; +} + +namespace { + +// Helper functions transforming blas arguments into cuBLAS arguments. + +cublasOperation_t CUDABlasTranspose(blas::Transpose trans) { + switch (trans) { + case blas::Transpose::kNoTranspose: + return CUBLAS_OP_N; + case blas::Transpose::kTranspose: + return CUBLAS_OP_T; + case blas::Transpose::kConjugateTranspose: + return CUBLAS_OP_C; + default: + LOG(FATAL) << "Invalid value of blas::Transpose."; + } +} + +cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) { + switch (uplo) { + case blas::UpperLower::kUpper: + return CUBLAS_FILL_MODE_UPPER; + case blas::UpperLower::kLower: + return CUBLAS_FILL_MODE_LOWER; + default: + LOG(FATAL) << "Invalid value of blas::UpperLower."; + } +} + +cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) { + switch (diag) { + case blas::Diagonal::kUnit: + return CUBLAS_DIAG_UNIT; + case blas::Diagonal::kNonUnit: + return CUBLAS_DIAG_NON_UNIT; + default: + LOG(FATAL) << "Invalid value of blas::Diagonal."; + } +} + +cublasSideMode_t CUDABlasSide(blas::Side side) { + switch (side) { + case blas::Side::kLeft: + return CUBLAS_SIDE_LEFT; + case blas::Side::kRight: + return CUBLAS_SIDE_RIGHT; + default: + LOG(FATAL) << "Invalid value of blas::Side."; + } +} + +} // namespace + +template <typename FuncT, typename... Args> +bool CUDABlas::DoBlasInternal(FuncT cublas_func, Stream *stream, + bool pointer_mode_host, Args...
args) { + mutex_lock lock{mu_}; + + CHECK(blas_ != nullptr); + if (!SetStream(stream)) { + return false; + } + + ScopedCublasPointerMode pointer_mode{parent_, blas_}; + if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST + : CUBLAS_POINTER_MODE_DEVICE)) { + return false; + } + + cublasStatus_t ret = cublas_func(parent_, blas_, args...); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to run cuBLAS routine " << cublas_func.kName << ": " + << ToString(ret); + return false; + } + + return true; +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal(dynload::cublasSasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal(dynload::cublasDasum, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasScasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDzasum, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDaxpy, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZaxpy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasScopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool 
CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZcopy, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasSdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDdot, stream, false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) { + return DoBlasInternal( + dynload::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) { + return DoBlasInternal( + dynload::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result) { + return DoBlasInternal( + dynload::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result) { + return DoBlasInternal( + dynload::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(result))); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, 
+ const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal(dynload::cublasSnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal(dynload::cublasDnrm2, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result) { + return DoBlasInternal( + dynload::cublasScnrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result) { + return DoBlasInternal( + dynload::cublasDznrm2, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, float c, float s) { + return DoBlasInternal( + dynload::cublasSrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, double c, + double s) { + return DoBlasInternal( + dynload::cublasDrot, stream, true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy, + float c, float s) { + return DoBlasInternal(dynload::cublasCsrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy, + double c, double s) { + return DoBlasInternal(dynload::cublasZdrot, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a, + DeviceMemory<float> *b, DeviceMemory<float> *c, + DeviceMemory<float> *s) { + return DoBlasInternal(dynload::cublasSrotg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(a), + CUDAMemoryMutable(b), CUDAMemoryMutable(c), + CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a, + DeviceMemory<double> *b, DeviceMemory<double> *c, + DeviceMemory<double> *s) { + return DoBlasInternal(dynload::cublasDrotg, stream, + false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b), + CUDAMemoryMutable(c), CUDAMemoryMutable(s)); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a, + DeviceMemory<std::complex<float>> *b, + DeviceMemory<float> *c, + DeviceMemory<std::complex<float>> *s) { + return DoBlasInternal( + dynload::cublasCrotg, stream, false /* = 
pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a, + DeviceMemory<std::complex<double>> *b, + DeviceMemory<double> *c, + DeviceMemory<std::complex<double>> *s) { + return DoBlasInternal( + dynload::cublasZrotg, stream, false /* = pointer_mode_host */, + CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)), + CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s))); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, + const DeviceMemory<float> &param) { + return DoBlasInternal(dynload::cublasSrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, + const DeviceMemory<double> &param) { + return DoBlasInternal(dynload::cublasDrotm, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, + CUDAMemory(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1, + DeviceMemory<float> *d2, DeviceMemory<float> *x1, + const DeviceMemory<float> &y1, + DeviceMemory<float> *param) { + return DoBlasInternal(dynload::cublasSrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1, + DeviceMemory<double> *d2, DeviceMemory<double> *x1, + const DeviceMemory<double> &y1, + DeviceMemory<double> *param) { + return DoBlasInternal(dynload::cublasDrotmg, stream, + false /* = pointer_mode_host */, CUDAMemoryMutable(d1), + CUDAMemoryMutable(d2), CUDAMemoryMutable(x1), + CUDAMemory(y1), CUDAMemoryMutable(param)); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasSscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDscal, stream, + true /* = pointer_mode_host */, elem_count, &alpha, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal( + dynload::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal( + dynload::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<float> alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal( + dynload::cublasCscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool
CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, + std::complex<double> alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal( + dynload::cublasZscal, stream, true /* = pointer_mode_host */, elem_count, + CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal(dynload::cublasCswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal(dynload::cublasZswap, stream, + true /* = pointer_mode_host */, elem_count, + CUDAComplex(CUDAMemoryMutable(x)), incx, + CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal(dynload::cublasIsamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal(dynload::cublasIdamax, stream, + false /* = pointer_mode_host */, elem_count, + CUDAMemory(x), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIcamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIzamax, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<float> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIsamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<double> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIdamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const 
DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIcamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + return DoBlasInternal( + dynload::cublasIzamin, stream, false /* = pointer_mode_host */, + elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result)); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, uint64 kl, uint64 ku, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgbmv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDgemv, stream, true /* = pointer_mode_host */, 
+ CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasCgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZgemv, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal( + dynload::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal( + dynload::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha, + CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgerc, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool 
CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZgeru, stream, true /* = pointer_mode_host */, m, n, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhemv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, 
&alpha, CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda) { + return DoBlasInternal( + dynload::cublasCher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda) { + return DoBlasInternal( + dynload::cublasZher2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(a)), lda); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &ap, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + return DoBlasInternal( + dynload::cublasChpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &ap, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + return DoBlasInternal( + dynload::cublasZhpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *ap) { + return DoBlasInternal( + dynload::cublasChpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *ap) { + return DoBlasInternal( + dynload::cublasZhpr, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *ap) { + return DoBlasInternal( + dynload::cublasChpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, 
CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *ap) { + return DoBlasInternal( + dynload::cublasZhpr2, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy, + CUDAComplex(CUDAMemoryMutable(ap))); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy) { + return DoBlasInternal( + dynload::cublasSsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n, + uint64 k, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + return DoBlasInternal( + dynload::cublasDsbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x), + incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &ap, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &ap, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDspmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap), + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *ap) { + return DoBlasInternal(dynload::cublasSspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *ap) { + return DoBlasInternal(dynload::cublasDspr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *ap) { + return DoBlasInternal(dynload::cublasSspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, 
+ const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *ap) { + return DoBlasInternal(dynload::cublasDspr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap)); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + return DoBlasInternal(dynload::cublasSsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy) { + return DoBlasInternal(dynload::cublasDsymv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda, + CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + float alpha, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + return DoBlasInternal(dynload::cublasSsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n, + double alpha, const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + return DoBlasInternal(dynload::cublasDsyr2, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x), + incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtbmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), 
incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtbsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasCtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + uint64 k, const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + return DoBlasInternal( + dynload::cublasZtbsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpmv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpmv, 
stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx) { + return DoBlasInternal( + dynload::cublasStpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal( + dynload::cublasDtpsv, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtpsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)), + CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return 
DoBlasInternal(dynload::cublasDtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrmv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + return DoBlasInternal(dynload::cublasStrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + return DoBlasInternal(dynload::cublasDtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAMemory(a), lda, + CUDAMemoryMutable(x), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx) { + return DoBlasInternal(dynload::cublasCtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx) { + return DoBlasInternal(dynload::cublasZtrsv, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), + CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)), + lda, CUDAComplex(CUDAMemoryMutable(x)), incx); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + VLOG(1) << port::Printf( + "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu " + "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f " + "c=%p ldc=%d", + static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha, + a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc); 
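+  // cuBLAS interprets matrices in column-major order: op(a) is m-by-k and
+  // op(b) is k-by-n in c = alpha*op(a)*op(b) + beta*c, so lda must be at
+  // least m (or at least k when a is transposed) and ldb at least k (or at
+  // least n when b is transposed). The checks below only warn on violation
+  // and still issue the cuBLAS call; satisfying the precondition remains the
+  // caller's responsibility.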
+ if (transa == blas::Transpose::kNoTranspose) { + if (lda < static_cast<int64>(m)) { + LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); " + "precondition violation"; + } + } else { + if (lda < static_cast<int64>(k)) { + LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k + << ") (transpose case); precondition violation"; + } + } + if (transb == blas::Transpose::kNoTranspose) { + if (ldb < static_cast<int64>(k)) { + LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k + << ") (no transpose case); precondition violation"; + } + } else { + if (ldb < static_cast<int64>(n)) { + LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); " + "precondition violation"; + } + } + return DoBlasInternal( + dynload::cublasSgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZgemm, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +template <typename T, typename FuncT> +port::Status CUDABlas::DoBlasGemmBatchedInternal( + FuncT cublas_func, Stream *stream, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha, + const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta, + const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc, + int batch_count) { + std::vector<T *> a_ptr_vec, b_ptr_vec, c_ptr_vec; + for (int i = 0; i < batch_count; ++i) { + a_ptr_vec.push_back(static_cast<T *>(a_array[i]->opaque())); + b_ptr_vec.push_back(static_cast<T *>(b_array[i]->opaque())); + c_ptr_vec.push_back(static_cast<T *>(c_array[i]->opaque())); + } + + typedef typename 
CUDAComplexT<T>::type CUDA_T; + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + SE_ASSIGN_OR_RETURN( + std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_ptr_array, + stream->AllocateTemporaryArray<CUDA_T *>(batch_count)); + + if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(), + a_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(b_ptr_array->mutable_device_memory(), + b_ptr_vec.data(), batch_count * sizeof(T *)) + .ok() || + !stream->ThenMemcpy(c_ptr_array->mutable_device_memory(), + c_ptr_vec.data(), batch_count * sizeof(T *)) + .ok()) { + return port::Status(port::error::INTERNAL, + "failed to copy memory from host to device in " + "CUDABlas::DoBlasGemmBatched"); + } + + bool ok = DoBlasInternal( + cublas_func, stream, true /* = pointer_mode_host */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + CUDAComplex(&alpha), + const_cast<const CUDA_T **>(CUDAMemory(a_ptr_array->device_memory())), + lda, + const_cast<const CUDA_T **>(CUDAMemory(b_ptr_array->device_memory())), + ldb, CUDAComplex(&beta), + const_cast<CUDA_T **>(CUDAMemory(c_ptr_array->device_memory())), ldc, + batch_count); + + if (ok) { + return port::Status::OK(); + } + return port::Status(port::error::INTERNAL, + "failed BLAS call, see log for details"); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, float alpha, + const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta, + const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc, + int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, double alpha, + const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda, + const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb, + double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array, + int lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array, + int ldb, std::complex<float> beta, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasGemmBatched( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<double> alpha, + const 
port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array, + int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array, + int ldb, std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array, + int ldc, int batch_count) { + SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal( + dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, + a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count)); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasChemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZhemm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZherk, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasCher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int 
lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc) { + return DoBlasInternal(dynload::cublasZher2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, &beta, + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side, + blas::UpperLower uplo, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsymm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb, + CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + float beta, DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + double beta, DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream 
*stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasCsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal( + dynload::cublasZsyrk, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, + CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + return DoBlasInternal( + dynload::cublasSsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc) { + return DoBlasInternal( + dynload::cublasDsyr2k, stream, true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha, + CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + return DoBlasInternal(dynload::cublasCsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo, + blas::Transpose trans, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + return DoBlasInternal(dynload::cublasZsyr2k, stream, + true /* = pointer_mode_host */, + CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, + k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, + CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta), + CUDAComplex(CUDAMemoryMutable(c)), ldc); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + 
DeviceMemory<float> *b, int ldb) { + return DoBlasInternal( + dynload::cublasStrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + return DoBlasInternal( + dynload::cublasDtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda, + CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrmm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb, + CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *b, int ldb) { + return DoBlasInternal(dynload::cublasStrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + return DoBlasInternal(dynload::cublasDtrsm, stream, + true /* = pointer_mode_host */, CUDABlasSide(side), + CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), + lda, CUDAMemoryMutable(b), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasCtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), 
CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, + blas::UpperLower uplo, blas::Transpose transa, + blas::Diagonal diag, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb) { + return DoBlasInternal( + dynload::cublasZtrsm, stream, true /* = pointer_mode_host */, + CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa), + CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha), + CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb); +} + +} // namespace cuda + +namespace gpu = ::perftools::gputools; + +void initialize_cublas() { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::BlasFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::blas::BlasSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuBLAS " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CUDABlas *blas = + new gpu::cuda::CUDABlas(cuda_executor); + if (!blas->Init()) { + // Note: Init() will log a more specific error. + delete blas; + return nullptr; + } + return blas; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuBLAS factory: " + << status.error_message(); + } + + // Prime the cuBLAS DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCublasDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuBLAS DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kBlas, + gpu::cuda::kCuBlasPlugin); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(register_cublas, + { perftools::gputools::initialize_cublas(); }); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h new file mode 100644 index 0000000000..1dfec2ebc5 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -0,0 +1,100 @@ +// CUDA-specific support for BLAS functionality -- this wraps the cuBLAS library +// capabilities, and is only included into CUDA implementation code -- it will +// not introduce cuda headers into other code. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/lib/stringpiece.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" + +typedef struct cublasContext *cublasHandle_t; + +namespace perftools { +namespace gputools { + +class Stream; + +namespace cuda { + +// Opaque and unique identifier for the cuBLAS plugin. +extern const PluginId kCuBlasPlugin; + +class CUDAExecutor; + +// BLAS plugin for CUDA platform via cuBLAS library. +// +// This satisfies the platform-agnostic BlasSupport interface. 
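+// +// A sketch of direct use, assuming a CUDA-backed StreamExecutor named +// stream_exec and a populated DeviceMemory<float> x (in practice callers +// usually go through the Stream::ThenBlas* convenience methods instead): +// +//   blas::BlasSupport *blas = stream_exec->AsBlas(); +//   Stream stream{stream_exec}; +//   stream.Init(); +//   if (blas != nullptr) { +//     blas->DoBlasScal(&stream, 1024, 2.0f, &x, 1);  // x <- 2.0 * x +//   }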
+//
+// Note that the cuBLAS handle that this encapsulates is implicitly tied to the
+// context (and, as a result, the device) that the parent CUDAExecutor is tied
+// to. This simply happens as an artifact of creating the cuBLAS handle when a
+// CUDA context is active.
+//
+// Thread-safe post-initialization.
+class CUDABlas : public blas::BlasSupport {
+ public:
+  explicit CUDABlas(CUDAExecutor *parent);
+
+  // Allocates a cuBLAS handle.
+  bool Init();
+
+  // Releases the cuBLAS handle, if present.
+  ~CUDABlas() override;
+
+  TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES
+
+ private:
+  // Tells cuBLAS to enqueue the BLAS operation onto a particular Stream.
+  //
+  // cuBLAS is stateful, and can only be associated with one stream (in order
+  // to enqueue dispatch) at a given time. As a result, this generally must be
+  // invoked before calling into cuBLAS.
+  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // A helper function that calls the real cuBLAS function together with error
+  // handling.
+  //
+  // cublas_func:        cuBLAS function pointer.
+  // cublas_name:        cuBLAS function name.
+  // stream:             Stream to enqueue the BLAS operation onto.
+  // pointer_mode_host:  Indicates whether the pointer to a scalar value is
+  //                     from host (true) or device (false).
+  // args:               Arguments of cuBLAS function.
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternal(FuncT cublas_func, Stream *stream,
+                      bool pointer_mode_host, Args... args);
+
+  // A helper function to implement DoBlasGemmBatched interfaces for generic
+  // types.
+  template <typename T, typename FuncT>
+  port::Status DoBlasGemmBatchedInternal(
+      FuncT cublas_func, Stream *stream, blas::Transpose transa,
+      blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+      const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
+      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
+      const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
+      int batch_count);
+
+  // Mutex that guards the cuBLAS handle for this device.
+  mutex mu_;
+
+  // CUDAExecutor which instantiated this CUDABlas.
+  // Immutable post-initialization.
+  CUDAExecutor *parent_;
+
+  // cuBLAS library handle on the device.
+ cublasHandle_t blas_ GUARDED_BY(mu_); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc new file mode 100644 index 0000000000..c01c9978a1 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc @@ -0,0 +1,260 @@ +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" + +#include <dirent.h> +#include <limits.h> +#include <link.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <unistd.h> +#include <algorithm> +#include <memory> +#include <vector> + +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/inlined_vector.h" +#include "tensorflow/stream_executor/lib/numbers.h" +#include "tensorflow/stream_executor/lib/process_state.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/str_util.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringpiece.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform/logging.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +static const char *kDriverVersionPath = "/proc/driver/nvidia/version"; + +string DriverVersionToString(DriverVersion version) { + return port::Printf("%d.%d", std::get<0>(version), std::get<1>(version)); +} + +string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) { + if (!version.ok()) { + return version.status().ToString(); + } + + return DriverVersionToString(version.ValueOrDie()); +} + +port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) { + std::vector<string> pieces = port::Split(value, '.'); + if (pieces.size() != 2) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("expected %%d.%%d form for driver version; got \"%s\"", + value.c_str())}; + } + + int major; + int minor; + if (!port::safe_strto32(pieces[0], &major)) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("could not parse major version number \"%s\" as an " + "integer from string \"%s\"", + pieces[0].c_str(), value.c_str())}; + } + if (!port::safe_strto32(pieces[1], &minor)) { + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("could not parse minor version number \"%s\" as an " + "integer from string \"%s\"", + pieces[1].c_str(), value.c_str())}; + } + + DriverVersion result{major, minor}; + VLOG(2) << "version string \"" << value << "\" made value " + << DriverVersionToString(result); + return result; +} + +// -- class Diagnostician + +string Diagnostician::GetDevNodePath(int dev_node_ordinal) { + return port::StrCat("/dev/nvidia", dev_node_ordinal); +} + +void Diagnostician::LogDiagnosticInformation() { + if (access(kDriverVersionPath, F_OK) != 0) { + LOG(INFO) << "kernel driver does not appear to be running on this host " + << "(" << port::Hostname() << "): " + << "/proc/driver/nvidia/version does not exist"; + return; + } + auto dev0_path = GetDevNodePath(0); + if (access(dev0_path.c_str(), F_OK) != 0) { + LOG(INFO) << "no NVIDIA GPU device is present: " << dev0_path + << " does not exist"; + return; + } + + LOG(INFO) << "retrieving CUDA diagnostic information for host: " + << port::Hostname(); + + 
+  LogDriverVersionInformation();
+}
+
+/* static */ void Diagnostician::LogDriverVersionInformation() {
+  LOG(INFO) << "hostname: " << port::Hostname();
+
+  if (VLOG_IS_ON(1)) {
+    const char *value = getenv("LD_LIBRARY_PATH");
+    string library_path = value == nullptr ? "" : value;
+    VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
+
+    std::vector<string> pieces = port::Split(library_path, ':');
+    for (auto piece : pieces) {
+      if (piece.empty()) {
+        continue;
+      }
+      DIR *dir = opendir(piece.c_str());
+      if (dir == nullptr) {
+        VLOG(1) << "could not open \"" << piece << "\"";
+        continue;
+      }
+      while (dirent *entity = readdir(dir)) {
+        VLOG(1) << piece << " :: " << entity->d_name;
+      }
+      closedir(dir);
+    }
+  }
+
+  port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
+  LOG(INFO) << "libcuda reported version is: "
+            << DriverVersionStatusToString(dso_version);
+
+  port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
+  LOG(INFO) << "kernel reported version is: "
+            << DriverVersionStatusToString(kernel_version);
+  if (kernel_version.ok() && dso_version.ok()) {
+    WarnOnDsoKernelMismatch(dso_version, kernel_version);
+  }
+}
+
+// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+// driver-interfacing DSO version number. Returns it as a parsed DriverVersion.
+port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
+  port::StatusOr<DriverVersion> result{port::Status{
+      port::error::NOT_FOUND,
+      "was unable to find libcuda.so DSO loaded into this program"}};
+
+  // Callback used when iterating through DSOs. Looks for the driver-interfacing
+  // DSO and yields its version number into the callback data, when found.
+  auto iterate_phdr =
+      [](struct dl_phdr_info *info, size_t size, void *data) -> int {
+    if (strstr(info->dlpi_name, "libcuda.so")) {
+      VLOG(1) << "found DLL info with name: " << info->dlpi_name;
+      char resolved_path[PATH_MAX] = {0};
+      if (realpath(info->dlpi_name, resolved_path) == nullptr) {
+        return 0;
+      }
+      VLOG(1) << "found DLL info with resolved path: " << resolved_path;
+      const char *slash = rindex(resolved_path, '/');
+      if (slash == nullptr) {
+        return 0;
+      }
+      const char *so_suffix = ".so.";
+      const char *dot = strstr(slash, so_suffix);
+      if (dot == nullptr) {
+        return 0;
+      }
+      string dso_version = dot + strlen(so_suffix);
+      // TODO(b/22689637): Eliminate the explicit namespace if possible.
+      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
+      auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
+      *result = StringToDriverVersion(stripped_dso_version);
+      return 1;
+    }
+    return 0;
+  };
+
+  dl_iterate_phdr(iterate_phdr, &result);
+
+  return result;
+}
+
+port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
+    const string &driver_version_file_contents) {
+  static const char *kDriverFilePrelude = "Kernel Module ";
+  size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
+  if (offset == string::npos) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        port::StrCat("could not find kernel module information in "
+                     "driver version file contents: \"",
+                     driver_version_file_contents, "\"")};
+  }
+
+  string version_and_rest = driver_version_file_contents.substr(
+      offset + strlen(kDriverFilePrelude), string::npos);
+  size_t space_index = version_and_rest.find(" ");
+  auto kernel_version = version_and_rest.substr(0, space_index);
+  // TODO(b/22689637): Eliminate the explicit namespace if possible.
+ auto stripped_kernel_version = + port::StripSuffixString(kernel_version, ".ld64"); + return StringToDriverVersion(stripped_kernel_version); +} + +void Diagnostician::WarnOnDsoKernelMismatch( + port::StatusOr<DriverVersion> dso_version, + port::StatusOr<DriverVersion> kernel_version) { + if (kernel_version.ok() && dso_version.ok() && + dso_version.ValueOrDie() == kernel_version.ValueOrDie()) { + LOG(INFO) << "kernel version seems to match DSO: " + << DriverVersionToString(kernel_version.ValueOrDie()); + } else { + LOG(ERROR) << "kernel version " + << DriverVersionStatusToString(kernel_version) + << " does not match DSO version " + << DriverVersionStatusToString(dso_version) + << " -- cannot find working devices in this configuration"; + } +} + + +port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() { + FILE *driver_version_file = fopen(kDriverVersionPath, "r"); + if (driver_version_file == nullptr) { + return port::Status{ + port::error::PERMISSION_DENIED, + port::StrCat("could not open driver version path for reading: ", + kDriverVersionPath)}; + } + + static const int kContentsSize = 1024; + port::InlinedVector<char, 4> contents(kContentsSize); + size_t retcode = + fread(contents.begin(), 1, kContentsSize - 2, driver_version_file); + if (retcode < kContentsSize - 1) { + contents[retcode] = '\0'; + } + contents[kContentsSize - 1] = '\0'; + + if (retcode != 0) { + LOG(INFO) << "driver version file contents: \"\"\"" << contents.begin() + << "\"\"\""; + fclose(driver_version_file); + return FindKernelModuleVersion(string{contents.begin()}); + } + + auto status = + port::Status{port::error::INTERNAL, + port::StrCat("failed to read driver version file contents: ", + kDriverVersionPath, "; ferror: ", + ferror(driver_version_file))}; + fclose(driver_version_file); + return status; +} + + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h new file mode 100644 index 0000000000..005b3dc310 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h @@ -0,0 +1,85 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_ + +#include <tuple> + +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// e.g. DriverVersion{331, 79} +using DriverVersion = std::tuple<int, int>; + +// Converts a parsed driver version to string form. +string DriverVersionToString(DriverVersion version); + +// Converts a parsed driver version or status value to natural string form. +string DriverVersionStatusToString(port::StatusOr<DriverVersion> version); + +// Converts a string of a form like "331.79" to a DriverVersion{331, 79}. +port::StatusOr<DriverVersion> StringToDriverVersion(const string &value); + +class Diagnostician { + public: + // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is + // not initializing). + // + // Note: if we're running on a machine that has no GPUs, we don't want to + // produce very much log spew beyond saying, "looks like there's no CUDA + // kernel + // module running". + // + // Note: we use non-Google-File:: API here because we may be called before + // InitGoogle has completed. 
+  static void LogDiagnosticInformation();
+
+  // Given the driver version file contents, finds the kernel module version
+  // and returns it as a parsed DriverVersion.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
+      const string &driver_version_file_contents);
+
+  // Extracts the kernel driver version from the current host.
+  static port::StatusOr<DriverVersion> FindKernelDriverVersion();
+
+  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+  // driver-interfacing DSO version number. Returns it as a parsed
+  // DriverVersion.
+  static port::StatusOr<DriverVersion> FindDsoVersion();
+
+  // Logs information about the kernel driver version and userspace driver
+  // library version.
+  static void LogDriverVersionInformation();
+
+ private:
+  // Logs information about the loaded nvidia-related kernel modules.
+  static void LogKernelModuleInformation();
+
+  // Given the DSO version number and the driver version file contents, extracts
+  // the driver version and compares, warning the user in the case of
+  // incompatibility.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static void WarnOnDsoKernelMismatch(
+      port::StatusOr<DriverVersion> dso_version,
+      port::StatusOr<DriverVersion> kernel_version);
+
+  // Logs information about the dev nodes present on this machine: their
+  // existence, permissions, accessibility from this uid/gid.
+  static void LogDevNodeDiagnosticInformation();
+
+  static string GetDevNodePath(int dev_node_ordinal);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
new file mode 100644
index 0000000000..6e4403512b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -0,0 +1,1074 @@
+#include "tensorflow/stream_executor/cuda/cuda_dnn.h"
+
+#include <dlfcn.h>
+#include <functional>
+
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+#include "third_party/gpus/cuda/include/cudnn.h"
+
+namespace {
+
+// Converts (via narrowing) a WideT value to a NarrowT value, and checks that
+// the value is unchanged by the conversion.
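+//
+// For example, a minimal sketch of the intended use (mirroring the descriptor
+// wrappers below, which narrow int64 dimensions into cudnn's int parameters):
+//
+//   int64 height = batch_descriptor.height();
+//   int h = CheckedNarrowing<int64, int>(height);  // CHECK-fails on overflow.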
+template <typename WideT, typename NarrowT> +NarrowT CheckedNarrowing(const WideT& wide) { + NarrowT narrow = wide; + CHECK_EQ(narrow, wide) + << "checked narrowing failed; values not equal post-conversion"; + return narrow; +} + +} // namespace + +namespace perftools { +namespace gputools { + +using dnn::BatchDescriptor; +using dnn::FilterDescriptor; +using dnn::ConvolutionDescriptor; +using dnn::PoolingDescriptor; + +namespace cuda { + +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin); + +extern CUstream AsCUDAStreamValue(Stream* stream); + +string ToString(cudnnStatus_t status) { + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + default: + return port::StrCat("<unknown cudnn status: ", static_cast<int>(status), + ">"); + } +} + +namespace dynload { + +static port::ThreadPool* InitCudnnThreadpool() { + port::ThreadPool* cudnn_threadpool_; + port::ThreadOptions options; + // TBD(keveman): Conservatively setting the stack size and guard size to 2MB, + // until we can get some guarantees from NVIDIA on the minimum stack space + // they will work with. + options.stack_size = 2 * 1024 * 1024; + options.guard_size = 2 * 1024 * 1024; + cudnn_threadpool_ = new port::ThreadPool(port::Env::Default(), options, + "cudnn_threadpool", 1); + CHECK(cudnn_threadpool_); + return cudnn_threadpool_; +} + +static mutex cudnn_threadpool_mu(LINKER_INITIALIZED); +static port::ThreadPool* GetCudaThreadpool() { + mutex_lock lock(cudnn_threadpool_mu); + static port::ThreadPool* cudnn_threadpool = InitCudnnThreadpool(); + return cudnn_threadpool; +} + +#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name) \ + struct DynLoadShim__##__name { \ + static const char* kName; \ + typedef std::add_pointer<decltype(::__name)>::type FuncPointerT; \ + static void* GetDsoHandle() { \ + static auto result = internal::CachedDsoLoader::GetCudnnDsoHandle(); \ + return result.ValueOrDie(); \ + } \ + static FuncPointerT DynLoad() { \ + static void* f = dlsym(GetDsoHandle(), kName); \ + if (f == nullptr) { \ + LOG(FATAL) << "could not find " << kName \ + << " in cudnn DSO; dlerror: " << dlerror(); \ + } \ + return reinterpret_cast<FuncPointerT>(f); \ + } \ + template <typename... Args> \ + void CallWrapper(CUDAExecutor* parent, port::Notification* n, \ + cudnnStatus_t* retval, const Args&... args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + *retval = DynLoad()(args...); \ + n->Notify(); \ + } \ + template <typename... Args> \ + cudnnStatus_t operator()(CUDAExecutor* parent, Args... 
args) { \ + port::Notification n; \ + cudnnStatus_t retval; \ + auto call_func_closure = \ + std::bind(&DynLoadShim__##__name::CallWrapper<Args...>, this, \ + parent, &n, &retval, args...); \ + GetCudaThreadpool()->Schedule(call_func_closure); \ + n.WaitForNotification(); \ + return retval; \ + } \ + } __name; \ + const char* DynLoadShim__##__name::kName = #__name; + +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor) __macro( \ + cudnnGetConvolutionNdForwardOutputDim) \ + __macro(cudnnGetConvolutionForwardAlgorithm) __macro( \ + cudnnCreateTensorDescriptor) __macro(cudnnDestroyTensorDescriptor) \ + __macro(cudnnCreateFilterDescriptor) \ + __macro(cudnnSetFilter4dDescriptor) \ + __macro(cudnnSetPooling2dDescriptor) \ + __macro(cudnnDestroyFilterDescriptor) \ + __macro(cudnnCreateConvolutionDescriptor) \ + __macro(cudnnCreatePoolingDescriptor) \ + __macro(cudnnAddTensor) \ + __macro(cudnnDestroyPoolingDescriptor) + +CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH + +// clang-format off +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetConvolution2dDescriptor) \ + __macro(cudnnDestroyConvolutionDescriptor) \ + __macro(cudnnCreate) \ + __macro(cudnnDestroy) \ + __macro(cudnnSetStream) \ + __macro(cudnnActivationForward) \ + __macro(cudnnConvolutionForward) \ + __macro(cudnnConvolutionBackwardData) \ + __macro(cudnnConvolutionBackwardFilter) \ + __macro(cudnnGetConvolutionForwardWorkspaceSize) \ + __macro(cudnnTransformTensor) \ + __macro(cudnnPoolingForward) \ + __macro(cudnnPoolingBackward) +// clang-format on + +CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH + +} // namespace dynload + +namespace { + +cudnnHandle_t ToHandle(void* opaque_handle) { + return static_cast<cudnnHandle_t>(opaque_handle); +} + +} // namespace + +CudnnSupport::CudnnSupport(CUDAExecutor* parent) + : parent_(parent), dnn_handle_(nullptr) {} + +CudnnSupport::~CudnnSupport() { + auto status = dynload::cudnnDestroy(parent_, ToHandle(dnn_handle_)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn handle: " << ToString(status); + } +} + +port::Status CudnnSupport::Init() { + auto status = dynload::cudnnCreate( + parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_)); + if (status == CUDNN_STATUS_SUCCESS) { + return port::Status::OK(); + } + + LOG(ERROR) << "could not create cudnn handle: " << ToString(status); + if (status == CUDNN_STATUS_NOT_INITIALIZED) { + // This is the error code that the driver returns when we're not running a + // sufficient CUDA driver -- cudnn requires 6.5+ compatibility, which + // starts with the 340.XX driver series. + auto result = cuda::Diagnostician::FindKernelDriverVersion(); + if (!result.ok()) { + LOG(ERROR) << "error retrieving driver version: " + << DriverVersionStatusToString(result); + } else { + const auto& version = result.ValueOrDie(); + LOG(INFO) << "running driver version: " << DriverVersionToString(version); + if (std::get<0>(version) < 340) { + LOG(ERROR) + << "cudnn library is only supported on 340.XX+ driver versions"; + } + } + } + return port::Status{port::error::INTERNAL, + port::StrCat("cudnn library could not create a handle: ", + ToString(status))}; +} + +// Turns a BatchDescriptor structure into a cudnn tensor handle within a scope. 
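+//
+// A hedged usage sketch (this mirrors how DoConvolve below uses the wrapper;
+// the cudnn descriptor is created in the constructor and destroyed when the
+// wrapper leaves scope, RAII-style):
+//
+//   ScopedTensorDescriptor input_4d{parent_, batch_descriptor,
+//                                   CUDNN_DATA_FLOAT};
+//   // ... pass input_4d.handle() to cudnn calls ...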
+class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor(CUDAExecutor* parent, + const BatchDescriptor& batch_descriptor, + cudnnDataType_t elem_type) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateTensorDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn tensor descriptor: " + << ToString(status); + } + + cudnnTensorFormat_t format; + switch (batch_descriptor.layout()) { + case dnn::DataLayout::kBatchYXDepth: + format = CUDNN_TENSOR_NHWC; + break; + case dnn::DataLayout::kBatchDepthYX: + format = CUDNN_TENSOR_NCHW; + break; + default: + LOG(FATAL) << "Unsupported tensor format " + << DataLayoutString(batch_descriptor.layout()); + break; + } + + status = dynload::cudnnSetTensor4dDescriptor( + parent_, handle_, format, elem_type, + CheckedNarrowing<int64, int>(batch_descriptor.count()), + CheckedNarrowing<int64, int>(batch_descriptor.feature_map_count()), + CheckedNarrowing<int64, int>(batch_descriptor.height()), + CheckedNarrowing<int64, int>(batch_descriptor.width())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn tensor descriptor: " + << ToString(status); + } + } + + ~ScopedTensorDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyTensorDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn tensor descriptor: " + << ToString(status); + } + } + + cudnnTensorDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnTensorDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +// Turns a FilterDescriptor structure into a cudnn filter handle within a scope. +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor(CUDAExecutor* parent, + const FilterDescriptor& filter_descriptor, + cudnnDataType_t elem_type) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateFilterDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn filter descriptor: " + << ToString(status); + } + + // TODO(b/23032134): Even if the filter layout is not supported, + // cudnnSetFilter4DDescriptor will return CUDNN_STATUS_SUCCESS because it + // does not take layout as an input. Maybe force cuDNN by giving wrong + // inputs intentionally? + switch (filter_descriptor.layout()) { + case dnn::FilterLayout::kOutputInputYX: + break; + default: + LOG(FATAL) << "Unsupported filter format " + << FilterLayoutString(filter_descriptor.layout()); + break; + } + + status = dynload::cudnnSetFilter4dDescriptor( + parent_, handle_, elem_type, + CheckedNarrowing<int64, int>( + filter_descriptor.output_feature_map_count()), + CheckedNarrowing<int64, int>( + filter_descriptor.input_feature_map_count()), + CheckedNarrowing<int64, int>(filter_descriptor.input_filter_height()), + CheckedNarrowing<int64, int>(filter_descriptor.input_filter_width())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn filter descriptor: " + << ToString(status); + } + } + + ~ScopedFilterDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyFilterDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn filter descriptor: " + << ToString(status); + } + } + + cudnnFilterDescriptor_t handle() const { return handle_; } + + private: + // Parent executor object. Not owned. 
+ CUDAExecutor* parent_; + + // cudnn filter descriptor this object creates. Owned. + cudnnFilterDescriptor_t handle_; + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +// Turns a ConvolutionDescriptor structure into a cudnn convolution handle +// within a scope. +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor( + CUDAExecutor* parent, const ConvolutionDescriptor& convolution_descriptor) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreateConvolutionDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn convolution descriptor: " + << ToString(status); + } + + status = dynload::cudnnSetConvolution2dDescriptor( + parent_, handle_, CheckedNarrowing<int64, int>( + convolution_descriptor.zero_padding_height()), + CheckedNarrowing<int64, int>( + convolution_descriptor.zero_padding_width()), + CheckedNarrowing<int64, int>( + convolution_descriptor.vertical_filter_stride()), + CheckedNarrowing<int64, int>( + convolution_descriptor.horizontal_filter_stride()), + // TODO(leary) not sure what the following two params do. + 1 /* = upscale_input_x */, 1 /* = upscale_input_y */, + // NOTE(keveman): cuDNN supports convolution and cross correlation. + // However, almost all the use cases do cross correlation, so just hard + // coding it here. + CUDNN_CROSS_CORRELATION); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn convolution descriptor: " + << ToString(status); + } + } + + ~ScopedConvolutionDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyConvolutionDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn convolution descriptor: " + << ToString(status); + } + } + + cudnnConvolutionDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnConvolutionDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle +// within a scope. +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor(CUDAExecutor* parent, + const PoolingDescriptor& pooling_descriptor) + : parent_(parent), handle_(nullptr) { + cudnnStatus_t status = + dynload::cudnnCreatePoolingDescriptor(parent_, &handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not create cudnn pooling descriptor: " + << ToString(status); + } + status = dynload::cudnnSetPooling2dDescriptor( + parent_, handle_, + (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum + ? 
CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING), + CheckedNarrowing<int64, int>(pooling_descriptor.window_height()), + CheckedNarrowing<int64, int>(pooling_descriptor.window_width()), + CheckedNarrowing<int64, int>(pooling_descriptor.vertical_padding()), + CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_padding()), + CheckedNarrowing<int64, int>(pooling_descriptor.vertical_stride()), + CheckedNarrowing<int64, int>(pooling_descriptor.horizontal_stride())); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "could not set cudnn pooling descriptor: " + << ToString(status); + } + } + ~ScopedPoolingDescriptor() { + cudnnStatus_t status = + dynload::cudnnDestroyPoolingDescriptor(parent_, handle_); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "could not destroy cudnn pooling descriptor: " + << ToString(status); + } + } + + cudnnPoolingDescriptor_t handle() const { return handle_; } + + private: + CUDAExecutor* parent_; // Parent executor. Not owned. + cudnnPoolingDescriptor_t handle_; // Owned. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +bool CudnnSupport::DoConvolve( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory<float>& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) { + ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor output_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + // The NO_WORKSPACE versions are possibly slower for certain shapes, but + // not so for the shapes currently used by Brain. Also, it seems prudent to + // keep cuMemAlloc off the critical path. 
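+  // (CUDNN_CONVOLUTION_FWD_NO_WORKSPACE asks cudnn for the fastest algorithm
+  // that requires no scratch memory, which is why the cudnnConvolutionForward
+  // call below passes a null workspace pointer and a zero workspace size.)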
+ cudnnConvolutionFwdAlgo_t algo; + status = dynload::cudnnGetConvolutionForwardAlgorithm( + parent_, ToHandle(dnn_handle_), input_4d.handle(), filter.handle(), + conv.handle(), output_4d.handle(), CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, + &algo); + + CHECK_EQ(status, CUDNN_STATUS_SUCCESS) + << "Unable to find a suitable algorithm for doing forward convolution"; + + status = dynload::cudnnConvolutionForward( + parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(), + input_data.opaque(), filter.handle(), filter_data.opaque(), conv.handle(), + algo, nullptr /* workspace ptr */, 0 /* workspace size */, &beta, + output_4d.handle(), output_data->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + + return true; +} + +bool CudnnSupport::DoConvolve( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory<double>& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory<double>& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory<double>* output_data) { + LOG(ERROR) << "double-based DNN not yet implemented"; + return false; +} + +DeviceMemory<float> CudnnSupport::MaybeTransformLayout( + Stream* stream, BatchDescriptor* output_descriptor, + DeviceMemory<float> backward_output_data, + std::unique_ptr<TemporaryDeviceMemory<float>>* transform_scratch) { + if (output_descriptor->layout() == dnn::DataLayout::kBatchDepthYX) { + return backward_output_data; + } + CHECK(output_descriptor->layout() == dnn::DataLayout::kBatchYXDepth); + *transform_scratch = + stream->AllocateTemporaryArray<float>(backward_output_data.ElementCount()) + .ConsumeValueOrDie(); + BatchDescriptor transformed_output_descriptor; + transformed_output_descriptor.CloneFrom(*output_descriptor); + transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX); + ScopedTensorDescriptor orig_out_back_4d{parent_, *output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor transformed_out_back_4d{ + parent_, transformed_output_descriptor, CUDNN_DATA_FLOAT}; + + float alpha = 1.0f; + float beta = 0.0f; + auto status = dynload::cudnnTransformTensor( + parent_, ToHandle(dnn_handle_), &alpha, orig_out_back_4d.handle(), + backward_output_data.opaque(), &beta, transformed_out_back_4d.handle(), + (*transform_scratch)->mutable_device_memory()->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "Failed to transform the data layout."; + } + output_descriptor->set_layout(dnn::DataLayout::kBatchDepthYX); + return (*transform_scratch)->device_memory(); +} + +bool CudnnSupport::DoConvolveBackwardData( + Stream* stream, const FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const BatchDescriptor& output_descriptor_in, + DeviceMemory<float> backward_output_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& input_descriptor, + DeviceMemory<float>* backward_input_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. 
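+  // (cudnn routines compute output = alpha * op(inputs) + beta * output, so
+  // alpha = 1 and beta = 0 simply overwrite the output buffer.)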
+ float beta = 0.0; + + // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. + BatchDescriptor output_descriptor; + output_descriptor.CloneFrom(output_descriptor_in); + std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch; + backward_output_data = MaybeTransformLayout( + stream, &output_descriptor, backward_output_data, &transform_scratch); + + ScopedTensorDescriptor out_back_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor in_back_4d{parent_, input_descriptor, + CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + status = dynload::cudnnConvolutionBackwardData( + parent_, ToHandle(dnn_handle_), &alpha, filter.handle(), + filter_data.opaque(), out_back_4d.handle(), backward_output_data.opaque(), + conv.handle(), &beta, in_back_4d.handle(), backward_input_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_descriptor_in, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory<float>* backward_filter_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status); + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass. + BatchDescriptor output_descriptor; + output_descriptor.CloneFrom(output_descriptor_in); + std::unique_ptr<TemporaryDeviceMemory<float>> transform_scratch; + backward_output_data = MaybeTransformLayout( + stream, &output_descriptor, backward_output_data, &transform_scratch); + + ScopedTensorDescriptor out_back_4d{parent_, output_descriptor, + CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor input_4d{parent_, input_descriptor, CUDNN_DATA_FLOAT}; + ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT}; + ScopedConvolutionDescriptor conv{parent_, convolution_descriptor}; + + status = dynload::cudnnConvolutionBackwardFilter( + parent_, ToHandle(dnn_handle_), &alpha, input_4d.handle(), + input_data.opaque(), out_back_4d.handle(), backward_output_data.opaque(), + conv.handle(), &beta, filter.handle(), backward_filter_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(FATAL) << "failed to enqueue convolution on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoMatMul(Stream* stream, + const DeviceMemory<float>& input_data, + const DeviceMemory<float>& weights, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + if (input_dimensions.count() != output_dimensions.count()) { + LOG(ERROR) << "MatMul input and output dimensions are not compatible."; + return false; + } + + // We do not permute the input or output, instead we just + // reinterpret the layout. 
We are working with row-major matrices + // and the rows of the input and output correspond to batch, so + // batch has to be outermost in both the input and output. + // + // By adding transposes to the BLAS gemm call we could perhaps make + // the kYXDepthBatch layout work as well, but there has been no need + // for that so far. + if (input_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + input_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) { + LOG(ERROR) << "Unsupported MatMul input layout."; + return false; + } + if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth && + output_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) { + LOG(ERROR) << "Unsupported MatMul output layout."; + return false; + } + + if (output_dimensions.width() == 1 && output_dimensions.height() == 1) { + // This is a fast path that also supports the kBatchYXDepth layout. + + // The matrices here are in row-major format while BLAS expects + // column-major, i.e. our matrices are transposed as far as BLAS + // is concerned. So we need to compute output^T = + // input^T*weights^T. There is no parameter for transposing the + // output in BLAS gemm, but instead we can transpose both sides of + // the equality to see that this is equivalent to + // output=weights*input. So we only need to swap the order of + // weights and input in the matrix product to correct for the + // row-major versus column-major difference. + const float alpha = 1.0f; // Take the matrix product without scaling it. + const float beta = 0.0f; // Ignore the original values in output_data. + const int64 m = output_dimensions.NodesAcrossFeatureMaps(); + const int64 n = input_dimensions.count(); + const int64 k = input_dimensions.NodesAcrossFeatureMaps(); + stream->ThenBlasGemm(blas::Transpose::kNoTranspose, + blas::Transpose::kNoTranspose, m, n, k, alpha, weights, + m, input_data, k, beta, output_data, m); + } else { + // This is a slower and more complex path that supports output + // width() * height() > 1, though it only supports the + // kBatchYXDepth layout. Does support kBatchDepthYX if output + // feature_map_count() == 1, as then there is no difference + // between the two layouts. + // + // The operation here is the same as above, except that we have to + // do the matrix multiplication for each (y,x) output coordinate + // separately. We then interpret weights as containing K = width() + // * height() different matrices, which we all multiply onto the + // matrix from input_data, yielding K matrix products. We then + // combine these together into one matrix by concatenating all the + // first rows of these matrices, then all the seconds rows and so + // on. We can do this with a batched matrix multiplication, where + // the result is written to a different submatrix of the output + // for each matrix multiplication. + // + // The reason that we only support the kBatchYXDepth output layout + // is that we have to do something in the depth for each (y,x) + // coordinate. The kBatchYXDepth layout has the depth information + // for each point (y,x) in contiguous memory while the + // kBatchDepthYX layout does not. + // + // TODO(broune): Consider a special case for when output depth == + // 1, as then possibly this could all be done as one matrix + // multiplication instead of a batched one, which should be + // faster. Another possibility would be to add a weights layout + // parameter and then support kBatchDepthYX for a different + // weights layout. 
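+    // As a concrete illustration of the submatrix scheme (hedged example
+    // numbers): with feature_map_count m = 2 and width() * height() = 3, we
+    // get batch_count = 3 products and ldc = 6. Product i is written starting
+    // at element offset i * m of the output, so under the column-major/ldc
+    // convention product 0 occupies rows 0-1, product 1 rows 2-3, and
+    // product 2 rows 4-5 of each output column -- exactly the kBatchYXDepth
+    // ordering, in which depth is contiguous for each (y,x) coordinate.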
+    if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth &&
+        !(output_dimensions.layout() == dnn::DataLayout::kBatchDepthYX &&
+          output_dimensions.feature_map_count() == 1)) {
+      LOG(ERROR) << "Unsupported MatMul output layout.";
+      return false;
+    }
+
+    const float alpha = 1.0f;  // Take the matrix product without scaling it.
+    const float beta = 0.0f;   // Ignore the original values in output_data.
+    const uint64 m = output_dimensions.feature_map_count();
+    const uint64 n = input_dimensions.count();
+    const uint64 k = input_dimensions.NodesAcrossFeatureMaps();
+    const int lda = m;
+    const int ldb = k;
+    const int ldc = output_dimensions.NodesAcrossFeatureMaps();
+    const int batch_count = output_dimensions.NodesPerFeatureMap();
+
+    std::vector<DeviceMemory<float>> a(batch_count);
+    std::vector<DeviceMemory<float>> b(batch_count);
+    std::vector<DeviceMemory<float>> c(batch_count);
+    for (int i = 0; i < batch_count; ++i) {
+      const int weights_offset = i * input_dimensions.NodesAcrossFeatureMaps() *
+                                 output_dimensions.feature_map_count();
+      // MakeFromByteSize takes a size in bytes, not an element count.
+      a[i] = DeviceMemory<float>::MakeFromByteSize(
+          const_cast<float*>(reinterpret_cast<const float*>(weights.opaque())) +
+              weights_offset,
+          (weights.ElementCount() - weights_offset) * sizeof(float));
+
+      b[i] = input_data;
+
+      const int output_offset = i * output_dimensions.feature_map_count();
+      c[i] = DeviceMemory<float>::MakeFromByteSize(
+          const_cast<float*>(
+              reinterpret_cast<const float*>(output_data->opaque())) +
+              output_offset,
+          (output_data->ElementCount() - output_offset) * sizeof(float));
+    }
+    const auto toPtrs = [](std::vector<DeviceMemory<float>>& v) {
+      std::vector<DeviceMemory<float>*> ptrs;
+      for (auto& mem : v) {
+        ptrs.push_back(&mem);
+      }
+      return ptrs;
+    };
+
+    stream->ThenBlasGemmBatched(blas::Transpose::kNoTranspose,
+                                blas::Transpose::kNoTranspose, m, n, k, alpha,
+                                toPtrs(a), lda, toPtrs(b), ldb, beta, toPtrs(c),
+                                ldc, batch_count);
+  }
+
+  return stream->ok();
+}
+
+bool CudnnSupport::DoBiasAdd(Stream* stream,
+                             const DeviceMemory<float>& input_data,
+                             const DeviceMemory<float>& biases,
+                             const dnn::BatchDescriptor& dimensions,
+                             DeviceMemory<float>* output_data) {
+  ScopedTensorDescriptor input_descriptor{parent_, dimensions,
+                                          CUDNN_DATA_FLOAT};
+
+  BatchDescriptor bias_dimensions;
+  bias_dimensions.set_count(1)
+      .set_feature_map_count(dimensions.feature_map_count())
+      .set_height(1)
+      .set_width(1)
+      .set_layout(dnn::DataLayout::kBatchYXDepth);
+  ScopedTensorDescriptor bias_descriptor{parent_, bias_dimensions,
+                                         CUDNN_DATA_FLOAT};
+
+  // cudnnAddTensor is in-place, so we need to copy input_data to
+  // output_data before doing the addition, unless the input and
+  // output are at the same address.
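+  // (CUDNN_ADD_SAME_C, used below, broadcasts the 1xCx1x1 bias tensor across
+  // the batch, height, and width dimensions of the output.)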
+ if (input_data.opaque() != output_data->opaque()) { + stream->ThenMemcpy(output_data, input_data, + dimensions.ElementCount() * sizeof(float)); + if (!stream->ok()) { + LOG(ERROR) + << "stream " << stream + << " could not enqueue a tensor copy as part of bias addition."; + return false; + } + } + + mutex_lock lock{dnn_handle_mutex_}; + + const float alpha = 1.0f; + const float beta = 1.0f; + auto status = dynload::cudnnAddTensor( + parent_, ToHandle(dnn_handle_), CUDNN_ADD_SAME_C, &alpha, + bias_descriptor.handle(), biases.opaque(), &beta, + input_descriptor.handle(), output_data->opaque()); + + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "stream " << stream << " could not enqueue bias addition."; + return false; + } + + return true; +} + +bool CudnnSupport::DoActivate(Stream* stream, + dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + cudnnActivationMode_t mode; + switch (activation_mode) { + case dnn::ActivationMode::kRelu6: + // TODO(leary) should probably do a post-pass to clip at 6? + LOG(WARNING) << "user requested Relu6, but providing Relu instead"; + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kReluX: + // TODO(broune) should probably do a post-pass to clip at X? + LOG(WARNING) << "user requested ReluX, but providing Relu instead"; + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kRelu: + mode = CUDNN_ACTIVATION_RELU; + break; + case dnn::ActivationMode::kSigmoid: + mode = CUDNN_ACTIVATION_SIGMOID; + break; + case dnn::ActivationMode::kTanh: + mode = CUDNN_ACTIVATION_TANH; + break; + default: + LOG(ERROR) << "unrecognized activation mode: " + << static_cast<int>(activation_mode); + return false; + } + + ScopedTensorDescriptor input_4d{parent_, dimensions, CUDNN_DATA_FLOAT}; + // Alpha is the input scaling factor. + float alpha = 1.0; + // Beta is the output scaling factor. + float beta = 0.0; + status = dynload::cudnnActivationForward( + parent_, ToHandle(dnn_handle_), mode, &alpha, input_4d.handle(), + input_data.opaque(), &beta, input_4d.handle(), output_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "stream " << stream + << " could not enqueue activation: " << ToString(status); + return false; + } + + return true; +} + +bool CudnnSupport::DoPoolForward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. 
+ float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, + CUDNN_DATA_FLOAT}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingForward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(), + output_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue forward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoPoolBackward( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) { + mutex_lock lock{dnn_handle_mutex_}; + auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_), + AsCUDAStreamValue(stream)); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status); + return false; + } + + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT}; + ScopedTensorDescriptor dest_desc{parent_, output_dimensions, + CUDNN_DATA_FLOAT}; + ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions}; + status = dynload::cudnnPoolingBackward( + parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha, + dest_desc.handle(), output_data.opaque(), dest_desc.handle(), + input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta, + src_desc.handle(), output_diff_data->opaque()); + if (status != CUDNN_STATUS_SUCCESS) { + LOG(ERROR) << "failed to enqueue backward pooling on stream: " + << ToString(status); + return false; + } + return true; +} + +bool CudnnSupport::DoNormalize( + Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoElementwiseOperate( + Stream* stream, dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + LOG(FATAL) << "not yet implemented"; // TODO(leary) +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + port::MutableArraySlice<uint8> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint16> host_dst) { + LOG(ERROR) << "quantized memcpy not supported by cuDNN"; + return false; +} + +bool CudnnSupport::DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& 
device_unquantized_src,
+    port::MutableArraySlice<int32> host_dst) {
+  LOG(ERROR) << "quantized memcpy not supported by cuDNN";
+  return false;
+}
+
+bool CudnnSupport::DoMemcpyH2DQuantized(
+    Stream* stream, port::ArraySlice<uint8> host_src,
+    DeviceMemory<float>* gpu_unquantized_dst) {
+  LOG(ERROR) << "quantized memcpy not supported by cuDNN";
+  return false;
+}
+
+bool CudnnSupport::DeriveOutputBatchDescriptor(
+    const BatchDescriptor& batch_descriptor,
+    const FilterDescriptor& filter_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    dnn::BatchDescriptor* output_batch_descriptor) {
+  ScopedTensorDescriptor input_4d{parent_, batch_descriptor, CUDNN_DATA_FLOAT};
+  ScopedFilterDescriptor filter{parent_, filter_descriptor, CUDNN_DATA_FLOAT};
+  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor};
+
+  int dims[4];
+  auto status = dynload::cudnnGetConvolutionNdForwardOutputDim(
+      parent_, conv.handle(), input_4d.handle(), filter.handle(), 4, dims);
+  if (status != CUDNN_STATUS_SUCCESS) {
+    LOG(ERROR) << "could not get output tensor for convolution: "
+               << ToString(status);
+    return false;
+  }
+
+  output_batch_descriptor->set_count(dims[0])
+      .set_feature_map_count(dims[1])
+      .set_height(dims[2])
+      .set_width(dims[3])
+      .set_layout(batch_descriptor.layout());
+  return true;
+}
+
+}  // namespace cuda
+
+namespace gpu = ::perftools::gputools;
+
+void initialize_cudnn() {
+  gpu::port::Status status =
+      gpu::PluginRegistry::Instance()
+          ->RegisterFactory<gpu::PluginRegistry::DnnFactory>(
+              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuDnnPlugin, "cuDNN",
+              [](gpu::internal::StreamExecutorInterface*
+                     parent) -> gpu::dnn::DnnSupport* {
+                gpu::cuda::CUDAExecutor* cuda_executor =
+                    dynamic_cast<gpu::cuda::CUDAExecutor*>(parent);
+                if (cuda_executor == nullptr) {
+                  LOG(ERROR)
+                      << "Attempting to initialize an instance of the cuDNN "
+                      << "support library with a non-CUDA StreamExecutor";
+                  return nullptr;
+                }
+
+                gpu::cuda::CudnnSupport* dnn =
+                    new gpu::cuda::CudnnSupport(cuda_executor);
+                if (!dnn->Init().ok()) {
+                  // Note: Init() will log a more specific error.
+                  delete dnn;
+                  return nullptr;
+                }
+                return dnn;
+              });
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to register cuDNN factory: "
+               << status.error_message();
+  }
+
+  // Prime the cuDNN DSO. The loader will log more information.
+  auto statusor = gpu::internal::CachedDsoLoader::GetCudnnDsoHandle();
+  if (!statusor.ok()) {
+    LOG(INFO) << "Unable to load cuDNN DSO.";
+  }
+
+  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
+                                                     gpu::PluginKind::kDnn,
+                                                     gpu::cuda::kCuDnnPlugin);
+}
+
+}  // namespace gputools
+}  // namespace perftools
+
+REGISTER_MODULE_INITIALIZER(register_cudnn,
+                            { perftools::gputools::initialize_cudnn(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
new file mode 100644
index 0000000000..08e952cee0
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -0,0 +1,206 @@
+// The CUDA-specific DNN library support, implementing the general DnnSupport
+// interface.
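+//
+// CudnnSupport is not used directly; it is registered as a plugin (see
+// initialize_cudnn() in cuda_dnn.cc) and reached through Stream builder
+// methods. A hedged sketch, assuming the ThenConvolve entry point that
+// stream.h exposes over dnn::DnnSupport:
+//
+//   Stream stream{stream_exec};
+//   stream.Init()
+//       .ThenConvolve(input_descriptor, input_data, filter_descriptor,
+//                     filter_data, convolution_descriptor, output_descriptor,
+//                     &output_data)
+//       .BlockHostUntilDone();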
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
+
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/temporary_device_memory.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+class CUDAExecutor;
+
+// Opaque and unique identifier for the cuDNN plugin.
+extern const PluginId kCuDnnPlugin;
+
+// cudnn-library based DNN support. For details on overridden interface
+// functions, see dnn.h.
+class CudnnSupport : public dnn::DnnSupport {
+ public:
+  explicit CudnnSupport(CUDAExecutor* parent);
+  ~CudnnSupport() override;
+
+  port::Status Init() override;
+
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+                  const DeviceMemory<float>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<float>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<float>* output_data) override;
+
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+                  const DeviceMemory<double>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<double>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<double>* output_data) override;
+
+  bool DoSeparableConvolve(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor, int depth_multiplier,
+      const DeviceMemory<float>& first_weights,
+      const DeviceMemory<float>& second_weights,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data) override {
+    LOG(ERROR) << "separable convolution not supported by CUDNN";
+    return false;
+  }
+
+  bool DoConvolveBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<float>* backward_input_data) override;
+
+  bool DoConvolveBackwardFilter(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemory<float>* backward_filter_data) override;
+
+  bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
+                const DeviceMemory<float>& weights,
+                const dnn::BatchDescriptor& input_dimensions,
+                const dnn::BatchDescriptor& output_dimensions,
+                DeviceMemory<float>* output_data) override;
+
+  bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data,
+                         const DeviceMemory<int8>& quantized_weights,
+                         const DeviceMemory<float>& weight_scales,
+                         const dnn::BatchDescriptor& input_dimensions,
+                         const dnn::BatchDescriptor& output_dimensions,
+                         DeviceMemory<float>* output_data) override {
LOG(ERROR) << "DNN MatMulQuantized not supported by CUDNN"; + return false; + } + + bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<int16>& quantized_weights, + const DeviceMemory<float>& weight_scales, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override { + LOG(ERROR) << "DNN MatMulQuantized not supported by CUDNN"; + return false; + } + + bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<float>& biases, + const dnn::BatchDescriptor& dimensions, + DeviceMemory<float>* output_data) override; + + bool DoActivate(Stream* stream, dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) override; + + bool DoPoolForward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoPoolBackward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) override; + + bool DoNormalize(Stream* stream, + const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) override; + + bool DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) override; + + bool DoElementwiseOperate( + Stream* stream, dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint8> host_dst) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<uint16> host_dst) override; + + bool DoMemcpyD2HQuantized(Stream* stream, + const DeviceMemory<float>& device_unquantized_src, + port::MutableArraySlice<int32> host_dst) override; + + bool DoMemcpyH2DQuantized( + Stream* stream, port::ArraySlice<uint8> host_src, + DeviceMemory<float>* device_unquantized_dst) override; + + // Derives an output batch descriptor from an input batch and convolution + // descriptors. + bool DeriveOutputBatchDescriptor( + const dnn::BatchDescriptor& batch_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + const dnn::ConvolutionDescriptor& convolution_descriptor, + dnn::BatchDescriptor* output_batch_descriptor); + + private: + // Guards the enqueueing of DNN operations via the dnn_handle_ below. + mutex dnn_handle_mutex_; + + CUDAExecutor* parent_; // Parent executor object. Not owned. + + // cudnn library handle. cudnnHandle_t type is not present in this header to + // prevent third-party library header inclusions from leaking outside the + // single cuda_dnn translation unit. 
+  void* dnn_handle_ GUARDED_BY(dnn_handle_mutex_);
+
+  // NOTE(keveman): Temporary data layout transformation until cuDNN supports
+  // kBatchYXDepth for backward pass. This function allocates temporary memory,
+  // lays out the source data into the temporary but in the kBatchDepthXY
+  // layout, and returns the temporary memory. The caller is responsible for
+  // deallocating the temporary. Since the allocation is done using Stream's
+  // AllocateTemporaryMemory, a later BlockHostUntilDone could be used for
+  // deallocation.
+  //
+  // transform_scratch is populated with a legitimate temporary allocation iff
+  // the original output data needs to be transformed.
+  DeviceMemory<float> MaybeTransformLayout(
+      Stream* stream, dnn::BatchDescriptor* output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      std::unique_ptr<TemporaryDeviceMemory<float>>* transform_scratch)
+      EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
new file mode 100644
index 0000000000..8c4316b4c1
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -0,0 +1,1608 @@
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <set>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/casts.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/human_readable.h"
+#include "tensorflow/stream_executor/lib/notification.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/lib/stacktrace.h"
+#include "tensorflow/stream_executor/lib/static_threadlocal.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
+
+bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
+bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
+bool FLAGS_gpuexec_cuda_device_0_only = false;
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+namespace dynload {
+
+#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                             \
+    static const char *kName;                                                \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;         \
+    static void *GetDsoHandle() {                                            \
+      static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+      return status.ValueOrDie();                                            \
+    }                                                                        \
+    static FuncPointerT DynLoad() {                                          \
+      static void *f = dlsym(GetDsoHandle(), kName);                         \
+      CHECK(f != nullptr) << "could not find " << kName                      \
+                          << " in libcuda DSO; dlerror: " << dlerror();      \
+      return reinterpret_cast<FuncPointerT>(f);                              \
+    }                                                                        \
+    template <typename... Args>                                              \
+    CUresult operator()(Args...
args) { \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent); + +} // namespace dynload + +namespace { + +// Manages the singleton set of contexts that we've created. 
This is used for
+// checking that no CUDA-runtime-created contexts have been generated
+// accidentally. CUDA-runtime-created contexts are avoided, if triple angle
+// brace launches are required, by using the scoped activations in
+// cuda_activation.h.
+class CreatedContexts {
+ public:
+  // Returns whether context is a member of the live set.
+  static bool Has(CUcontext context) {
+    shared_lock lock{mu_};
+    return Live()->find(context) != Live()->end();
+  }
+
+  // Adds context to the live set.
+  static void Add(CUcontext context) {
+    CHECK(context != nullptr);
+    mutex_lock lock{mu_};
+    Live()->emplace(context);
+  }
+
+  // Removes context from the live set.
+  static void Remove(CUcontext context) {
+    CHECK(context != nullptr);
+    mutex_lock lock{mu_};
+    Live()->erase(context);
+  }
+
+ private:
+  // Returns the live set singleton.
+  static std::set<CUcontext> *Live() {
+    static auto singleton = new std::set<CUcontext>;
+    return singleton;
+  }
+
+  // Lock that guards access-to/mutation-of the live set.
+  static mutex mu_;
+};
+
+/* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
+
+// Formats CUresult to output prettified values into a log stream.
+// Error summaries taken from:
+// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
+//
+// TODO(leary) switch to cuGetErrorName when updated cuda.h is available.
+string ToString(CUresult result) {
+#define OSTREAM_CUDA_ERROR(__name) \
+  case CUDA_ERROR_##__name:        \
+    return "CUDA_ERROR_" #__name;
+
+///////////////
+// NOTE: here we specify return code values outside of the enum explicitly
+// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
+// libraries are deployed in the fleet. These error codes are backwards
+// compatible, but if we see a "new" one, we want to be able to identify it in
+// the logs.
+//
+// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
+// eliminate this function and just rely on the driver to provide us these
+// strings.
+//
+// NOTE: "Must reboot all context" below is shorthand for "must
+// destroy/recreate the offending context and any allocations which come from
+// it if you are to continue using CUDA."
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch"
+  switch (result) {
+    OSTREAM_CUDA_ERROR(INVALID_VALUE)
+    OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
+    OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
+    OSTREAM_CUDA_ERROR(DEINITIALIZED)
+    OSTREAM_CUDA_ERROR(NO_DEVICE)
+    OSTREAM_CUDA_ERROR(INVALID_DEVICE)
+    OSTREAM_CUDA_ERROR(INVALID_IMAGE)
+    OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
+    OSTREAM_CUDA_ERROR(INVALID_HANDLE)
+    OSTREAM_CUDA_ERROR(NOT_FOUND)
+    OSTREAM_CUDA_ERROR(NOT_READY)
+    OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)
+
+    // Encountered an uncorrectable ECC error during execution.
+    OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)
+
+    // Load/store on an invalid address. Must reboot all context.
+    case 700:
+      return "CUDA_ERROR_ILLEGAL_ADDRESS";
+    // Passed too many / wrong arguments, too many threads for register count.
+    case 701:
+      return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+    // Kernel took too long to execute.
+    case 702:
+      return "CUDA_ERROR_LAUNCH_TIMEOUT";
+    // Kernel launch uses an incompatible texturing mode.
+    case 703:
+      return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+    // Trying to re-enable peer access that already has it enabled.
+    case 704:
+      return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+    // Trying to disable peer access that has not yet been enabled.
+ case 705: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + // Primary context for the specified device has already been initialized. + case 708: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + // Context current to calling thread has been destroyed or is a primary + // context that has not yet been initialized. + case 709: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + // Device-side assert triggered during kernel execution. Must reboot all + // context. + case 710: + return "CUDA_ERROR_ASSERT"; + // Hardware resources to enable peer access have been exhausted. + case 711: + return "CUDA_ERROR_TOO_MANY_PEERS"; + // Memory range has already been registered. + case 712: + return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + // Pointer does not correspond to any currently registered memory region. + case 713: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + // Due to stack corruption or exceeding stack size limit. Must reboot all + // context. + case 714: + return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + case 715: + return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + // Load/store on an unaligned memory address. Must reboot all context. + case 716: + return "CUDA_ERROR_MISALIGNED_ADDRESS"; + // Device instruction with specific address space given address not + // belonging to allowed address space. Must reboot all context. + case 717: + return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + // Device program counter wrapped its address space. Must reboot all + // context. + case 718: + return "CUDA_ERROR_INVALID_PC"; + // Exception on device while executing a kernel; e.g. deref invalid device + // pointer, accessing OOB shared memory. Must reboot all context. + case 719: + return "CUDA_ERROR_LAUNCH_FAILED"; + + OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE) + OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED) + OSTREAM_CUDA_ERROR(NOT_PERMITTED) + OSTREAM_CUDA_ERROR(NOT_SUPPORTED) + OSTREAM_CUDA_ERROR(UNKNOWN) // Unknown internal error to CUDA. + default: + return port::StrCat("CUresult(", static_cast<int>(result), ")"); + } +#pragma GCC diagnostic pop +} + +// Returns the current context and checks that it is in the set of CUDA contexts +// created by StreamExecutor (to ensure that the CUDA runtime didn't create a +// context behind our backs). +CUcontext CurrentContext() { + CUcontext current = nullptr; + CUresult result = dynload::cuCtxGetCurrent(¤t); + if (result != CUDA_SUCCESS) { + LOG(FATAL) << "failed to query current context: " << ToString(result); + } + if (current != nullptr && !CreatedContexts::Has(current)) { + LOG(FATAL) << "current context was not created by the StreamExecutor " + "cuda_driver API: " + << current + << "; a CUDA runtime call " + "was likely performed without using a StreamExecutor context"; + } + return current; +} + +// "Pops" the current context, checks that it matches expected, and checks the +// postcondition that the current context is nullptr. +// +// This is not done when we're nested within a MultiOpActivation, as we want to +// persist the active context until the MultiOpActivation is popped. 
+void PopContextAndCheckNowNull(CUcontext expected) {
+  CUcontext actual = CurrentContext();
+  CHECK_EQ(expected, actual) << "would pop unexpected context";
+  CUcontext popped;
+  CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxPopCurrent_v2(&popped));
+  CHECK_EQ(expected, popped);
+  CHECK(nullptr == CurrentContext());
+  VLOG(3) << "popped context " << expected
+          << " and current context is now null";
+}
+
+// CUDA driver routines may require a large amount of stack (particularly
+// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
+// stack-limited threads (such as those spawned by a default-argument
+// thread::ThreadPool on some platforms), we run certain routines in this pool
+// and wait for completion.
+static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
+static port::ThreadPool *InitializeDriverExecutor() {
+  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
+                              "cuda_driver", 1);
+}
+
+port::ThreadPool *GetDriverExecutor() {
+  mutex_lock lock(driver_executor_threadpool_mu);
+  static port::ThreadPool *thread_pool = InitializeDriverExecutor();
+  return thread_pool;
+}
+
+}  // namespace
+
+
+// Thread-local storage that indicates whether a CUDA context activation is
+// being nested within an outer, MultiOpActivation. In that case, we should not
+// pop the context to nullptr when we are done with the current activation.
+SE_STATIC_THREAD_LOCAL_POD(bool, tls_in_multi_op_activation);
+
+string MemorySpaceString(MemorySpace memory_space) {
+  switch (memory_space) {
+    case MemorySpace::kHost:
+      return "host";
+    case MemorySpace::kDevice:
+      return "device";
+    default:
+      LOG(FATAL) << "impossible memory space";
+  }
+}
+
+// Implementation note: the CUDA context is held, per-thread, in TLS. We avoid
+// setting it all the time because it's not clear what side effects might occur
+// for a "set" operation, whereas a "get" operation we can reasonably assume is
+// a TLS read.
+//
+// We cannot race here because CUcontext is associated with a particular thread
+// and stored in TLS; and these interfaces should not be used from signal
+// handlers.
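+//
+// Typical RAII usage, as seen throughout this file (sketch):
+//
+//   {
+//     ScopedActivateContext activation{context};
+//     // ... driver calls that require `context` to be current ...
+//   }  // The prior context (possibly null) is restored on scope exit.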
+ScopedActivateContext::ScopedActivateContext(CUcontext context,
+                                             MultiOpActivation moa)
+    : context_(CHECK_NOTNULL(context)),
+      previously_in_multi_op_activation_(tls_in_multi_op_activation.get()) {
+  if (static_cast<bool>(moa)) {
+    tls_in_multi_op_activation.get() = true;
+  }
+
+  CUcontext current = prior_context_ = CurrentContext();
+  if (current != context) {
+    VLOG(3) << "ScopedActivateContext switching context from " << current
+            << " to " << context;
+    CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(context));
+    if (FLAGS_gpuexec_cuda_sync_around_driver_calls) {
+      auto res = dynload::cuCtxSynchronize();
+      if (res != CUDA_SUCCESS) {
+        LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found "
+                   << ToString(res)
+                   << " immediately after establishing the device context "
+                   << context << " :: " << port::CurrentStackTrace();
+      }
+    }
+  }
+}
+
+ScopedActivateContext::~ScopedActivateContext() {
+  if (tls_in_multi_op_activation.get()) {
+    CHECK_EQ(context_, CurrentContext());
+    if (FLAGS_gpuexec_cuda_sync_around_driver_calls) {
+      auto res = dynload::cuCtxSynchronize();
+      if (res != CUDA_SUCCESS) {
+        LOG(FATAL) << "gpuexec_cuda_sync_around_driver_calls found "
+                   << ToString(res)
+                   << " immediately after de-establishing the device context "
+                   << context_ << " :: " << port::CurrentStackTrace();
+      }
+    }
+    CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(prior_context_));
+  } else {
+    PopContextAndCheckNowNull(context_);
+  }
+  tls_in_multi_op_activation.get() = previously_in_multi_op_activation_;
+}
+
+namespace {
+
+// Returns a stringified device number associated with pointer, primarily for
+// logging purposes. Returns "?" if the device could not be successfully
+// queried.
+string CUDAPointerToDeviceString(CUdeviceptr pointer) {
+  auto value = CUDADriver::GetPointerDevice(pointer);
+  if (value.ok()) {
+    return port::StrCat(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified memory space associated with pointer, primarily for
+// logging purposes. Returns "?" if the memory space could not be successfully
+// queried.
+string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
+  auto value = CUDADriver::GetPointerMemorySpace(pointer);
+  if (value.ok()) {
+    return MemorySpaceString(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query memory space: " << value.status();
+  return "?";
+}
+
+// Returns a stringified representation of whether or not peer access is
+// permitted between the "from" and "to" pointers' associated contexts,
+// primarily for logging purposes. Returns "error" if an error is encountered
+// in the process of querying.
+string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
+  auto from_context = CUDADriver::GetPointerContext(from);
+  if (!from_context.ok()) {
+    LOG(ERROR) << "could not retrieve source pointer's context: "
+               << from_context.status();
+    return "error";
+  }
+  auto to_context = CUDADriver::GetPointerContext(to);
+  if (!to_context.ok()) {
+    LOG(ERROR) << "could not retrieve destination pointer's context: "
+               << to_context.status();
+    return "error";
+  }
+  return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
+                                         to_context.ValueOrDie())
+             ? "true"
+             : "false";
+}
+
+
+// Actually performs the work of CUDA initialization. Wrapped up in one-time
+// execution guard.
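+// (The guard itself lives in CUDADriver::Init() below, which caches this
+// function's result under a mutex.)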
+static port::Status InternalInit() { + CUresult res = CUDA_ERROR_NO_DEVICE; + if (FLAGS_gpuexec_cuda_driver_inject_init_error) { + LOG(ERROR) << "injecting CUDA init error; initialization will fail"; + } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) { + // We only call cuInit if we can dynload libcuda. + + res = dynload::cuInit(0 /* = flags */); + } + + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } + + LOG(ERROR) << "failed call to cuInit: " << ToString(res); + Diagnostician::LogDiagnosticInformation(); + return port::Status{port::error::ABORTED, + port::StrCat("failed call to cuInit: ", ToString(res))}; +} + +} // namespace + +/* static */ port::Status CUDADriver::Init() { + // Cached return value from calling InternalInit(), as cuInit need only be + // called once, but CUDADriver::Init may be called many times. + static port::Status init_retval; + static bool set = false; + static mutex init_mu(LINKER_INITIALIZED); + + mutex_lock lock(init_mu); + if (!set) { + init_retval = InternalInit(); + set = true; + } + + return init_retval; +} + +/* static */ port::Status CUDADriver::GetDevice(int device_ordinal, + CUdevice *device) { + CUresult res = dynload::cuDeviceGet(device, device_ordinal); + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed call to cuDeviceGet: ", ToString(res))}; +} + +/* static */ bool CUDADriver::GetDeviceName(CUdevice device, + string *device_name) { + static const size_t kCharLimit = 64; + port::InlinedVector<char, 4> chars(kCharLimit); + CUresult res = + dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get device name for " << device << ": " + << ToString(res); + return false; + } + chars[kCharLimit - 1] = '\0'; + *device_name = chars.begin(); + return true; +} + +bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) { + static_assert(DeviceOptions::kMask == 0xf, + "needs update for new device options"); + + if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) { + *flags |= CU_CTX_LMEM_RESIZE_TO_MAX; + } + + // If no flags are set the default is CU_CTX_SCHED_AUTO, which + // in Google environments is very likely to mean SPIN. 
+ if (device_options.flags() & DeviceOptions::kScheduleSpin) { + *flags |= CU_CTX_SCHED_SPIN; + } + if (device_options.flags() & DeviceOptions::kScheduleYield) { + *flags |= CU_CTX_SCHED_YIELD; + } + if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) { + *flags |= CU_CTX_SCHED_BLOCKING_SYNC; + } + + return true; +} + +/* static */ port::Status CUDADriver::CreateContext( + CUdevice device, DeviceOptions device_options, CUcontext *context) { + CUcontext former_context = CurrentContext(); + if (former_context != nullptr) { + LOG(WARNING) << "creating context when one is currently active; existing: " + << former_context; + } + + int flags = 0; + if (!DeviceOptionsToContextFlags(device_options, &flags)) { + LOG(WARNING) << "could not convert all device options into context flags"; + } + + CUresult res; + { + // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their + // context creation: see http://b/13248943 + + res = dynload::cuCtxCreate_v2(context, flags, device); + } + if (res == CUDA_SUCCESS) { + CreatedContexts::Add(*context); + PopContextAndCheckNowNull(*context); + CHECK(*context != nullptr) + << "success in this call must entail non-null result"; + VLOG(2) << "created context " << context << " for this thread"; + return port::Status::OK(); + } + + string message = "failed call to cuCtxCreate: " + ToString(res); + if (res == CUDA_ERROR_OUT_OF_MEMORY) { + uint64 total_memory; + if (GetDeviceTotalMemory(device, &total_memory)) { + port::StrAppend(&message, "; total memory reported: ", total_memory); + } else { + port::StrAppend(&message, "; could not query total memory"); + } + } + + return port::Status{port::error::INTERNAL, message}; +} + +/* static */ void CUDADriver::DestroyContext(CUcontext context) { + if (context == nullptr) { + return; + } + + CUresult res = dynload::cuCtxDestroy_v2(context); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to destroy CUDA context; leaking: " << ToString(res); + } + + CreatedContexts::Remove(context); +} + +/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute, + CUfunction func, + int *attribute_value) { + CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query kernel attribute. kernel: " << func + << ", attribute: " << attribute; + return false; + } + return true; +} + +/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function, + CUfunc_cache cache_config) { + CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function + << ", config: " << cache_config << ", result: " << ToString(res); + return false; + } + + return true; +} + +/* static */ port::StatusOr<CUsharedconfig> +CUDADriver::ContextGetSharedMemConfig(CUcontext context) { + CUsharedconfig shared_mem_config; + ScopedActivateContext activation{context}; + CUresult result = dynload::cuCtxGetSharedMemConfig(&shared_mem_config); + if (result != CUDA_SUCCESS) { + CUdevice device; + dynload::cuCtxGetDevice(&device); + LOG(ERROR) << "failed to get CUDA device shared memory config. 
" + << "Context device ID: " << device + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to get shared memory config: ", ToString(result))}; + } + return shared_mem_config; +} + +/* static */ port::Status CUDADriver::ContextSetSharedMemConfig( + CUcontext context, CUsharedconfig shared_mem_config) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config); + if (result != CUDA_SUCCESS) { + CUdevice device; + dynload::cuCtxGetDevice(&device); + LOG(ERROR) << "failed to set CUDA device shared memory config. " + << "Context device ID: " << device + << ", config: " << shared_mem_config + << ", result: " << ToString(result); + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to set shared memory config: ", ToString(result))}; + } + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LaunchKernel( + CUcontext context, CUfunction function, unsigned int grid_dim_x, + unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x, + unsigned int block_dim_y, unsigned int block_dim_z, + unsigned int shared_mem_bytes, CUstream stream, void **kernel_params, + void **extra) { + ScopedActivateContext activation{context}; + VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x + << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z + << " bdx: " << block_dim_x << " bdy: " << block_dim_y + << " bdz: " << block_dim_z; + CUresult res = dynload::cuLaunchKernel( + function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, + block_dim_z, shared_mem_bytes, stream, kernel_params, extra); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to launch CUDA kernel: " << function + << "; result: " << ToString(res); + return false; + } + VLOG(2) << "successfully launched kernel"; + return true; +} + +/* static */ port::Status CUDADriver::LoadCubin(CUcontext context, + const char *cubin_bytes, + CUmodule *module) { + ScopedActivateContext activation{context}; + CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes); + if (result != CUDA_SUCCESS) { + return port::Status{port::error::INTERNAL, + "failed to load in-memory CUBIN: " + ToString(result)}; + } + + return port::Status::OK(); +} + +/* static */ bool CUDADriver::LoadPtx(CUcontext context, + const char *ptx_contents, + CUmodule *module) { + port::Notification notification; + bool ret = true; + GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret, + ¬ification]() { + ScopedActivateContext activation{context}; + void *ptx_data = const_cast<char *>(ptx_contents); + static const unsigned int kLogBufferBytesLimit = 1024; + unsigned int error_log_buffer_bytes = kLogBufferBytesLimit; + unsigned int info_log_buffer_bytes = kLogBufferBytesLimit; + port::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes); + port::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes); + bool log_verbose = true; + CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE}; + // Note that the driver API wants the contents of this values to be stored + // in an array of void*s, so we coerce them accordingly. 
+    void *option_values[] = {
+        port::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
+        port::bit_cast<void *>(error_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
+        port::bit_cast<void *>(info_log_buffer.data()),
+        port::bit_cast<void *>(uintptr_t(log_verbose))};
+    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+
+    CUresult res;
+    {
+      // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
+      // module loading: see http://b/13248943
+
+      res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options),
+                                        options, option_values);
+    }
+
+    // The PTX JIT mutates the values in the option values array to reflect the
+    // size of the logs it output; now that we've made the call, read the values
+    // back out.
+    error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
+    info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
+    CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
+    CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
+
+    if (res != CUDA_SUCCESS) {
+      LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
+      // As a precaution for null termination of the API-provided value, ensure
+      // that at least the last byte is null.
+      error_log_buffer[error_log_buffer_bytes ?
+                       error_log_buffer_bytes - 1 : 0] = '\0';
+      LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
+                 << " bytes): " << error_log_buffer.data();
+      ret = false;
+      notification.Notify();
+      return;  // Avoid notifying (and logging success output) a second time.
+    }
+
+    VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
+            << " bytes): " << info_log_buffer.data();
+    VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
+            << " bytes): " << error_log_buffer.data();
+    CHECK(module != nullptr);
+    notification.Notify();
+  });
+  notification.WaitForNotification();
+
+  return ret;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint8(CUcontext context,
+                                                     CUdeviceptr location,
+                                                     uint8 value, size_t size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD8_v2(location, value, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemsetUint32(CUcontext context,
+                                                      CUdeviceptr location,
+                                                      uint32 value,
+                                                      size_t uint32_count) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemsetUint32(CUcontext context,
+                                                       CUdeviceptr location,
+                                                       uint32 value,
+                                                       size_t uint32_count,
+                                                       CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res =
+      dynload::cuMemsetD32Async(location, value, uint32_count, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool CUDADriver::AddStreamCallback(CUcontext context,
+                                                CUstream stream,
+                                                StreamCallback callback,
+                                                void *data) {
+  // Note: flags param is required to be zero according to CUDA 6.0.
+ CUresult res = + dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "unable to add host callback: " << ToString(res); + return false; + } + return true; +} + +/* static */ bool CUDADriver::GetModuleFunction(CUcontext context, + CUmodule module, + const char *kernel_name, + CUfunction *function) { + ScopedActivateContext activated{context}; + CHECK(module != nullptr && kernel_name != nullptr); + CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name + << "\" from module: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::GetModuleSymbol(CUcontext context, + CUmodule module, + const char *symbol_name, + CUdeviceptr *dptr, + size_t *bytes) { + ScopedActivateContext activated{context}; + CHECK(module != nullptr && symbol_name != nullptr && + (dptr != nullptr || bytes != nullptr)); + CUresult res = + dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name); + if (res != CUDA_SUCCESS) { + // symbol may not be found in the current module, but it may reside in + // another module. + VLOG(2) << "failed to get symbol \"" << symbol_name + << "\" from module: " << ToString(res); + return false; + } + + return true; +} + +/* static */ void CUDADriver::UnloadModule(CUcontext context, CUmodule module) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuModuleUnload(module); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to unload module " << module + << "; leaking: " << ToString(res); + } +} + +/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext( + CUcontext context) { + ScopedActivateContext activated{context}; + CUdevice device = -1; + CUresult result = dynload::cuCtxGetDevice(&device); + if (result == CUDA_SUCCESS) { + return device; + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to get device for context: ", ToString(result))}; +} + +/* static */ bool CUDADriver::CreateStream(CUcontext context, CUstream *out) { + // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess + // up synchronization with respect to memsets and any other things that have + // to occur on the default stream? 
+ ScopedActivateContext activated{context}; + CUresult res = dynload::cuStreamCreate(out, 0); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not allocate CUDA stream for context " << context + << ": " << ToString(res); + return false; + } + + VLOG(2) << "successfully created stream " << *out << " for context " + << context << " on thread"; + return true; +} + +/* static */ void CUDADriver::DestroyStream(CUcontext context, + CUstream *stream) { + if (*stream == nullptr) { + return; + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuStreamDestroy_v2(*stream); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to destroy CUDA stream for context " << context + << ": " << ToString(res); + } else { + VLOG(2) << "successfully destroyed stream " << *stream << " for context " + << context; + *stream = nullptr; + } +} + +/* static */ void *CUDADriver::DeviceAllocate(CUcontext context, uint64 bytes) { + ScopedActivateContext activated{context}; + CUdeviceptr result = 0; + CUresult res = dynload::cuMemAlloc_v2(&result, bytes); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to allocate " + << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes + << " bytes) from device: " << ToString(res); + return nullptr; + } + void *ptr = reinterpret_cast<void *>(result); + VLOG(2) << "allocated " << ptr << " for context " << context << " of " + << bytes << " bytes"; + return ptr; +} + +/* static */ void CUDADriver::DeviceDeallocate(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location); + CUresult res = dynload::cuMemFree_v2(pointer); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to free device memory at " << location + << "; result: " << ToString(res); + } else { + VLOG(2) << "deallocated " << location << " for context " << context; + } +} + +/* static */ void *CUDADriver::HostAllocate(CUcontext context, uint64 bytes) { + ScopedActivateContext activation{context}; + void *host_mem = nullptr; + // "Portable" memory is visible to all CUDA contexts. Safe for our use model. + CUresult res = + dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to alloc " << bytes + << " bytes on host: " << ToString(res); + } + return host_mem; +} + +/* static */ void CUDADriver::HostDeallocate(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemFreeHost(location); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error deallocating host memory at " << location << ": " + << ToString(res); + } +} + +/* static */ bool CUDADriver::HostRegister(CUcontext context, void *location, + uint64 bytes) { + ScopedActivateContext activation{context}; + // "Portable" memory is visible to all CUDA contexts. Safe for our use model. 
+ CUresult res = + dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error registering host memory at " << location << ": " + << ToString(res); + return false; + } + return true; +} + +/* static */ bool CUDADriver::HostUnregister(CUcontext context, + void *location) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemHostUnregister(location); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "error unregistering host memory at " << location << ": " + << ToString(res); + return false; + } + return true; +} + +/* static */ port::Status CUDADriver::DestroyEvent(CUcontext context, + CUevent *event) { + if (*event == nullptr) { + return port::Status{port::error::INVALID_ARGUMENT, + "input event cannot be null"}; + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventDestroy_v2(*event); + *event = nullptr; + + switch (res) { + case CUDA_SUCCESS: + return port::Status::OK(); + case CUDA_ERROR_DEINITIALIZED: + case CUDA_ERROR_NOT_INITIALIZED: + return port::Status{ + port::error::FAILED_PRECONDITION, + port::Printf("error destroying CUDA event in context %p: %s", context, + ToString(res).c_str())}; + default: + return port::Status{ + port::error::INTERNAL, + port::Printf("error destroying CUDA event in context %p: %s", context, + ToString(res).c_str())}; + } +} + +/* static */ port::Status CUDADriver::RecordEvent(CUcontext context, + CUevent event, + CUstream stream) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventRecord(event, stream); + switch (res) { + case CUDA_SUCCESS: + return port::Status::OK(); + case CUDA_ERROR_DEINITIALIZED: + case CUDA_ERROR_NOT_INITIALIZED: + return port::Status{ + port::error::FAILED_PRECONDITION, + port::Printf("error recording CUDA event on stream %p: %s", stream, + ToString(res).c_str())}; + default: + return port::Status{ + port::error::INVALID_ARGUMENT, + port::Printf("error recording CUDA event on stream %p: %s", stream, + ToString(res).c_str())}; + } +} + +/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(CUcontext context, + CUevent event) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventQuery(event); + if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to query event: %s", ToString(res).c_str())}; + } + + return res; +} + +/* static */ bool CUDADriver::GetEventElapsedTime(CUcontext context, + float *elapsed_milliseconds, + CUevent start, CUevent stop) { + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to get elapsed time between events: " + << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::WaitStreamOnEvent(CUcontext context, + CUstream stream, + CUevent event) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not wait stream on event: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::SynchronizeContext(CUcontext context) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuCtxSynchronize(); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res) + << " :: " << port::CurrentStackTrace(); + 
return false;
+  }
+
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronizeStream(CUcontext context,
+                                                CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamSynchronize(stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "could not synchronize on CUDA stream: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+  VLOG(2) << "successfully synchronized stream " << stream << " on context "
+          << context;
+  return true;
+}
+
+/* static */ bool CUDADriver::IsStreamIdle(CUcontext context, CUstream stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  CUresult res = dynload::cuStreamQuery(stream);
+  if (res == CUDA_SUCCESS) {
+    return true;
+  }
+
+  if (res != CUDA_ERROR_NOT_READY) {
+    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
+  }
+  return false;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2H(CUcontext context,
+                                                   void *host_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed synchronous memcpy from device to host: %s; "
+        "host dst: %p; GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
+          << host_dst;
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyH2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   const void *host_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed synchronous memcpy from host to device: %s; GPU dst: %p;"
+        " host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::SynchronousMemcpyD2D(CUcontext context,
+                                                   CUdeviceptr gpu_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed synchronous memcpy from device to device: %s; GPU dst: %p; "
+        "GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst),
+        port::bit_cast<void *>(gpu_src), size, size);
+    return false;
+  }
+  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CUcontext context,
+                                                    void *host_dst,
+                                                    CUdeviceptr gpu_src,
+                                                    uint64 size,
+                                                    CUstream stream) {
+  ScopedActivateContext activation{context};
+  CUresult res = dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << port::Printf(
+        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
+        "GPU src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
+          << " bytes from " << port::bit_cast<void *>(gpu_src) << " to "
+          << host_dst << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool
CUDADriver::AsynchronousMemcpyH2D(CUcontext context, + CUdeviceptr gpu_dst, + const void *host_src, + uint64 size, + CUstream stream) { + ScopedActivateContext activation{context}; + CUresult res = dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << port::Printf( + "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; " + "host src: %p; size: %llu=0x%llx", + ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size); + return false; + } + VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes" + << " on stream " << stream; + return true; +} + +/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CUcontext context, + CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, + uint64 size, + CUstream stream) { + ScopedActivateContext activation{context}; + CUresult result = + dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream); + if (result != CUDA_SUCCESS) { + LOG(ERROR) << port::Printf( + "failed to enqueue async memcpy from device to device: %s" + "; GPU dst: %p on %s %s" + "; GPU src: %p on %s %s" + "; can access? %s; size: %llu=0x%llx", + ToString(result).c_str(), port::bit_cast<void *>(gpu_dst), + CUDAPointerToMemorySpaceString(gpu_dst).c_str(), + CUDAPointerToDeviceString(gpu_dst).c_str(), port::bit_cast<void *>(gpu_src), + CUDAPointerToMemorySpaceString(gpu_src).c_str(), + CUDAPointerToDeviceString(gpu_src).c_str(), + CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size); + + return false; + } + VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes"; + return true; +} + +/* static */ port::Status CUDADriver::CreateEvent(CUcontext context, + CUevent *result, + EventFlags flags) { + int cuflags; + switch (flags) { + case EventFlags::kDefault: + cuflags = CU_EVENT_DEFAULT; + break; + case EventFlags::kDisableTiming: + cuflags = CU_EVENT_DISABLE_TIMING; + break; + default: + LOG(FATAL) << "impossible event flags: " << int(flags); + } + + ScopedActivateContext activated{context}; + CUresult res = dynload::cuEventCreate(result, cuflags); + + if (res == CUDA_SUCCESS) { + return port::Status::OK(); + } else if (res == CUDA_ERROR_OUT_OF_MEMORY) { + return port::Status{port::error::RESOURCE_EXHAUSTED, + "could not create CUDA event: out of device memory"}; + } else { + return port::Status{ + port::error::FAILED_PRECONDITION, + port::StrCat("could not create CUDA event: ", ToString(res))}; + } +} + +/* static */ int CUDADriver::GetDeviceCount() { + int device_count = 0; + CUresult res = dynload::cuDeviceGetCount(&device_count); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res); + return 0; + } + + if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) { + device_count = 1; + } + return device_count; +} + +/* static */ port::StatusOr<CUcontext> CUDADriver::GetPointerContext( + CUdeviceptr pointer) { + CUcontext context = nullptr; + CUresult result = dynload::cuPointerGetAttribute( + &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); + if (result == CUDA_SUCCESS) { + CHECK(context != nullptr) << "success should entail non-null context"; + return context; + } + + return port::Status{ + port::error::INTERNAL, + port::StrCat("failed to query device pointer for context: ", + ToString(result))}; +} + +/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace( + CUdeviceptr pointer) { + unsigned int value; + CUresult result = dynload::cuPointerGetAttribute( + &value, 
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
+  if (result == CUDA_SUCCESS) {
+    switch (value) {
+      case CU_MEMORYTYPE_DEVICE:
+        return MemorySpace::kDevice;
+      case CU_MEMORYTYPE_HOST:
+        return MemorySpace::kHost;
+      default:
+        return port::Status{
+            port::error::INTERNAL,
+            port::StrCat("unknown memory space provided by CUDA API: ", value)};
+    }
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::StrCat("failed to query device pointer for memory space: ",
+                   ToString(result))};
+}
+
+/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
+                                                             CUdeviceptr *base,
+                                                             size_t *size) {
+  CUresult result = dynload::cuMemGetAddressRange(base, size, dptr);
+  if (result == CUDA_SUCCESS) {
+    return port::Status::OK();
+  } else if (result == CUDA_ERROR_NOT_FOUND) {
+    // We differentiate between "this pointer is unknown" (return here) and
+    // "there was an internal error while performing this operation" (return
+    // below).
+    return port::Status{
+        port::error::NOT_FOUND,
+        port::Printf("not a device pointer %p; %s",
+                     reinterpret_cast<void *>(dptr), ToString(result).c_str())};
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::Printf("failed to get pointer info for device pointer %p; %s",
+                   reinterpret_cast<void *>(dptr), ToString(result).c_str())};
+}
+
+/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
+    CUdeviceptr pointer) {
+  auto result = GetPointerContext(pointer);
+  if (!result.ok()) {
+    return result.status();
+  }
+
+  return DeviceFromContext(result.ValueOrDie());
+}
+
+/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
+                                                           int *cc_minor,
+                                                           CUdevice device) {
+  *cc_major = 0;
+  *cc_minor = 0;
+  CUresult result =
+      dynload::cuDeviceComputeCapability(cc_major, cc_minor, device);
+  if (result == CUDA_SUCCESS) {
+    return port::Status::OK();
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      port::Printf("failed to get compute capability for device: %s; %d",
+                   ToString(result).c_str(), device)};
+}
+
+// Helper function that turns the integer output of cuDeviceGetAttribute into
+// type T and wraps it in a StatusOr.
+template <typename T> +static port::StatusOr<T> GetSimpleAttribute(CUdevice device, + CUdevice_attribute attribute) { + int value = -1; + CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device); + if (result != CUDA_SUCCESS) { + return port::Status{ + port::error::NOT_FOUND, + port::StrCat("could not retrieve CUDA device attribute (", attribute, + "): ", ToString(result))}; + } + T converted = value; + return converted; +} + +/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount( + CUdevice device) { + return GetSimpleAttribute<int>(device, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor( + CUdevice device) { + return GetSimpleAttribute<int64>( + device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>(device, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock( + CUdevice device) { + return GetSimpleAttribute<int64>(device, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); +} + +/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp( + CUdevice device) { + return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE); +} + +/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z, + CUdevice device) { + int value; + CUresult res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim x: " << ToString(res); + return false; + } + *x = value; + + res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); + return false; + } + *y = value; + + res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); + return false; + } + *z = value; + return true; +} + +/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) { + CUresult res = dynload::cuDriverGetVersion(driver_version); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query driver version: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties, + int device_ordinal) { + CUresult res = + dynload::cuDeviceGetProperties(device_properties, device_ordinal); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query device properties: " << ToString(res); + return false; + } + + return true; +} + +/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) { + int value = -1; + CUresult res = dynload::cuDeviceGetAttribute( + &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query ECC status: " << ToString(res); + return false; + } + + *result = value; + return true; +} + 
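+// Example of consuming the StatusOr-returning attribute getters above (a
+// sketch, not code from this change):
+//
+//   auto sm_count = CUDADriver::GetMultiprocessorCount(device);
+//   if (sm_count.ok()) {
+//     VLOG(1) << "multiprocessor count: " << sm_count.ValueOrDie();
+//   }
+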
+/* static */ bool CUDADriver::GetDeviceMemoryInfo(CUcontext context, + int64 *free_out, + int64 *total_out) { + ScopedActivateContext activation{context}; + size_t free = 0; + size_t total = 0; + CUresult res = dynload::cuMemGetInfo_v2(&free, &total); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query device memory info: " << ToString(res); + return false; + } + + *free_out = free; + *total_out = total; + return true; +} + +/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device, + uint64 *result) { + size_t value = -1; + CUresult res = dynload::cuDeviceTotalMem_v2(&value, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query total available memory: " << ToString(res); + return false; + } + + *result = value; + return true; +} + +/* static */ string CUDADriver::GetPCIBusID(CUdevice device) { + string pci_bus_id; + static const int kBufferSize = 64; + port::InlinedVector<char, 4> chars(kBufferSize); + chars[kBufferSize - 1] = '\0'; + CUresult res = + dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); + return pci_bus_id; + } + pci_bus_id = chars.begin(); + return pci_bus_id; +} + +/* static */ bool CUDADriver::CanEnablePeerAccess(CUcontext from, + CUcontext to) { + if (from == to) { + return true; // A context can always access its own memory. + } + + int can_access_peer = -1; + auto from_device = DeviceFromContext(from); + if (!from_device.ok()) { + LOG(ERROR) << "failed to resolve 'from' peer access context to a device: " + << from_device.status(); + return false; + } + auto to_device = DeviceFromContext(to); + if (!to_device.ok()) { + LOG(ERROR) << "failed to resolve 'to' peer access context to a device: " + << to_device.status(); + return false; + } + CUresult res = dynload::cuDeviceCanAccessPeer( + &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie()); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); + return false; + } + + return can_access_peer; +} + +/* static */ port::Status CUDADriver::EnablePeerAccess(CUcontext from, + CUcontext to) { + if (from == to) { + return port::Status::OK(); // A context can always access its own memory. 
+ } + + ScopedActivateContext activated{from}; + CUresult result = dynload::cuCtxEnablePeerAccess(to, 0 /* = flags */); + if (result != CUDA_SUCCESS && + result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to enable peer access from %p to %p: %s", from, to, + ToString(result).c_str())}; + } + + return port::Status::OK(); +} + +/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore( + CUcontext context, CUfunction kernel, int threads_per_block, + size_t dynamic_shared_memory_bytes) { + ScopedActivateContext activation{context}; + + int max_blocks; + CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes); + if (result != CUDA_SUCCESS) { + return port::Status{ + port::error::INTERNAL, + port::Printf("failed to calculate occupancy of kernel %p: %s", kernel, + ToString(result).c_str())}; + } + + return max_blocks; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h new file mode 100644 index 0000000000..007db222d9 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_driver.h @@ -0,0 +1,460 @@ +// CUDA userspace driver library wrapper functionality. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ + +#include <stddef.h> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/cuda/multi_op_activation.h" +#include "tensorflow/stream_executor/device_options.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "third_party/gpus/cuda/include/cuda.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// Identifies the memory space where an allocation resides. See +// CUDADriver::GetPointerMemorySpace(). +enum class MemorySpace { kHost, kDevice }; + +// Returns a casual string, such as "host" for the provided memory space. +string MemorySpaceString(MemorySpace memory_space); + +// CUDADriver contains wrappers for calls to the userspace library driver. It's +// useful to isolate these calls and put basic wrappers around them to separate +// userspace library driver behaviors from the rest of the program. +// +// At the moment it's simply used as a namespace. +// +// The calls log any specific errors internally and return whether the operation +// was successful to the caller. +// +// The order of parameters is generally kept symmetric with the underlying CUDA +// driver API. +// +// Links on functions are to specific documentation under +// http://docs.nvidia.com/cuda/cuda-driver-api/ +// +// Thread safety: these functions should not be used from signal handlers. +class CUDADriver { + public: + // Wraps a call to cuInit with logging to help indicate what has gone wrong in + // the case of failure. Safe to call multiple times; will be fast on all calls + // after the first. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3 + static port::Status Init(); + + // Returns the device associated with the given context. + // device is an outparam owned by the caller, must not be null. 
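+  // Note the device is returned via the port::StatusOr return value rather
+  // than an outparam; a minimal usage sketch (illustrative):
+  //   auto device = CUDADriver::DeviceFromContext(context);
+  //   if (device.ok()) { CUdevice d = device.ValueOrDie(); ... }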
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
+  static port::StatusOr<CUdevice> DeviceFromContext(CUcontext context);
+
+  // Creates a new CUDA stream associated with the given context via
+  // cuStreamCreate.
+  // stream is an outparam owned by the caller, must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
+  static bool CreateStream(CUcontext context, CUstream *stream);
+
+  // Destroys a CUDA stream associated with the given context.
+  // stream is owned by the caller, must not be null, and *stream is set to null
+  // if the stream is successfully destroyed.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
+  static void DestroyStream(CUcontext context, CUstream *stream);
+
+  // CUDA events can explicitly disable event TSC retrieval for some presumed
+  // performance improvement if timing is unnecessary.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  enum class EventFlags { kDefault, kDisableTiming };
+
+  // Creates a new event associated with the given context.
+  // result is an outparam owned by the caller and must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  static port::Status CreateEvent(CUcontext context, CUevent *result,
+                                  EventFlags flags);
+
+  // Destroys *event and turns it into a nullptr. event may not be null, but
+  // *event may be, via cuEventDestroy.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
+  static port::Status DestroyEvent(CUcontext context, CUevent *event);
+
+  // Allocates a GPU memory space of size bytes associated with the given
+  // context via cuMemAlloc.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
+  static void *DeviceAllocate(CUcontext context, uint64 bytes);
+
+  // Deallocates the GPU memory space at location associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  static void DeviceDeallocate(CUcontext context, void *location);
+
+  // Allocates page-locked and CUDA-registered memory on the host via
+  // cuMemAllocHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
+  static void *HostAllocate(CUcontext context, uint64 bytes);
+
+  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
+  static void HostDeallocate(CUcontext context, void *location);
+
+  // Registers a memory region at location of size bytes via cuMemHostRegister.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
+  static bool HostRegister(CUcontext context, void *location, uint64 bytes);
+
+  // Unregisters a memory region that was previously registered at location via
+  // cuMemHostUnregister.
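+  //
+  // For example, pinning a staging buffer around a burst of transfers
+  // (illustrative sketch; buffer and size are assumed locals):
+  //   CHECK(CUDADriver::HostRegister(context, buffer, size));
+  //   ... enqueue copies that touch buffer ...
+  //   CHECK(CUDADriver::HostUnregister(context, buffer));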
+ // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14 + // + // TODO(leary) verify an error will be returned if the location wasn't + // previously registered. + static bool HostUnregister(CUcontext context, void *location); + + // Given a device ordinal, returns a device handle into the device outparam, + // which must not be null. + // + // N.B. these device handles do not have a corresponding destroy function in + // the CUDA driver API. + static port::Status GetDevice(int device_ordinal, CUdevice *device); + + // Given a device handle, returns the name reported by the driver for the + // device. + static bool GetDeviceName(CUdevice device, string *name_out); + + // Given a device to create a context for, returns a context handle into the + // context outparam, which must not be null. + // + // N.B. CUDA contexts are weird. They are implicitly associated with the + // calling thread. Current documentation on contexts and their influence on + // userspace processes is given here: + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf + static port::Status CreateContext(CUdevice device, + DeviceOptions device_options, + CUcontext *context); + + // Destroys the provided context via cuCtxDestroy. + // Don't do this while clients could still be using the context, per the docs + // bad things will happen. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e + static void DestroyContext(CUcontext context); + + // Queries the runtime for the specified attribute of the specified function. + // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates + // in terms of integer-sized values, so there's no potential for overrun (as + // of CUDA 5.5). + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b + static bool FuncGetAttribute(CUfunction_attribute attribute, + CUfunction function, int *attribute_value); + + // Sets the preferred cache configuration for the specified function. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681 + static bool FuncSetCacheConfig(CUfunction function, + CUfunc_cache cache_config); + + // Gets the preferred shared memory bank configuration for the specified + // CONTEXT (not function!), either default or four- or eight-byte bank size. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74 + static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig( + CUcontext context); + + // Sets the preferred shared memory bank configuration for the specified + // CONTEXT (not function!), either default or four- or eight-byte bank size. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692 + static port::Status ContextSetSharedMemConfig( + CUcontext context, CUsharedconfig shared_mem_config); + + // Launches a CUDA kernel via cuLaunchKernel. + // TODO(leary) describe the structure of kernel_params and extra in a readable + // way. 
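+  //
+  // In brief (a sketch of the cuLaunchKernel convention, not a full answer
+  // to the TODO above): kernel_params is an array with one void* entry per
+  // kernel argument, each pointing at that argument's storage; extra is an
+  // alternative argument-buffer packaging and is typically null here. A
+  // caller with a kernel taking (CUdeviceptr, int) might write (the local
+  // names are assumptions):
+  //   void *params[] = {&dptr, &n};
+  //   CUDADriver::LaunchKernel(context, function, gx, gy, gz, bx, by, bz,
+  //                            shared_mem_bytes, stream, params, nullptr);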
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15 + static bool LaunchKernel(CUcontext context, CUfunction function, + unsigned int grid_dim_x, unsigned int grid_dim_y, + unsigned int grid_dim_z, unsigned int block_dim_x, + unsigned int block_dim_y, unsigned int block_dim_z, + unsigned int shared_mem_bytes, CUstream stream, + void **kernel_params, void **extra); + + // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting + // handle in "module". Any error logs that are produced are logged internally. + static bool LoadPtx(CUcontext context, const char *ptx_contents, + CUmodule *module); + + // Loads cubin_bytes with the CUDA driver's blob loading interface and stores + // the resulting handle in "module". + static port::Status LoadCubin(CUcontext context, const char *cubin_bytes, + CUmodule *module); + + // Retrieves a named kernel from a loaded module, and places the resulting + // handle into function (outparam) on success. Neither kernel_name nor + // function may be null. No ownership is taken of kernel_name. + static bool GetModuleFunction(CUcontext context, CUmodule module, + const char *kernel_name, CUfunction *function); + + // Retrieves a named global/constant symbol from a loaded module, and returns + // a device pointer and size of the symbol on success. symbol_name may not be + // null. At least one of dptr or bytes should not be null. No ownership is + // taken of symbol_name. + static bool GetModuleSymbol(CUcontext context, CUmodule module, + const char *symbol_name, CUdeviceptr *dptr, + size_t *bytes); + + // Unloads module from the current context via cuModuleUnload. + // TODO(leary) the documentation doesn't say what kind of disasters happen + // if you try to unload a module while its CUfunctions are in use. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b + static void UnloadModule(CUcontext context, CUmodule module); + + // Performs a synchronous memset of the device memory segment via cuMemsetD8. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b + static bool SynchronousMemsetUint8(CUcontext context, CUdeviceptr location, + uint8 value, size_t size); + + // Performs a synchronous memset of the device memory segment via cuMemsetD32. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132 + static bool SynchronousMemsetUint32(CUcontext context, CUdeviceptr location, + uint32 value, size_t uint32_count); + + // Performs an asynchronous memset of the device memory segment via + // cuMemsetD32Async. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5 + static bool AsynchronousMemsetUint32(CUcontext context, CUdeviceptr location, + uint32 value, size_t uint32_count, + CUstream stream); + + // -- Synchronous memcopies. 
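+  //
+  // For example, a blocking host-to-device copy and read-back (illustrative
+  // sketch; host_buf, dev_ptr, and len are assumed locals):
+  //   CHECK(CUDADriver::SynchronousMemcpyH2D(context, dev_ptr, host_buf, len));
+  //   CHECK(CUDADriver::SynchronousMemcpyD2H(context, host_buf, dev_ptr, len));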
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169 + + static bool SynchronousMemcpyD2H(CUcontext context, void *host_dst, + CUdeviceptr gpu_src, uint64 size); + static bool SynchronousMemcpyH2D(CUcontext context, CUdeviceptr gpu_dst, + const void *host_src, uint64 size); + static bool SynchronousMemcpyD2D(CUcontext context, CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, uint64 size); + + // -- Asynchronous memcopies. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362 + + static bool AsynchronousMemcpyD2H(CUcontext context, void *host_dst, + CUdeviceptr gpu_src, uint64 size, + CUstream stream); + static bool AsynchronousMemcpyH2D(CUcontext context, CUdeviceptr gpu_dst, + const void *host_src, uint64 size, + CUstream stream); + static bool AsynchronousMemcpyD2D(CUcontext context, CUdeviceptr gpu_dst, + CUdeviceptr gpu_src, uint64 size, + CUstream stream); + + // The CUDA stream callback type signature. + // The data passed to AddStreamCallback is subsequently passed to this + // callback when it fires. + // + // Some notable things: + // * Callbacks must not make any CUDA API calls. + // * Callbacks from independent streams execute in an undefined order and may + // be serialized. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483 + typedef void (*StreamCallback)(CUstream stream, CUresult status, void *data); + + // Enqueues a callback operation into stream. + // See StreamCallback above and the NVIDIA documentation for additional + // details. + static bool AddStreamCallback(CUcontext context, CUstream stream, + StreamCallback callback, void *data); + + // Causes stream to wait for event to trigger before proceeding via + // cuStreamWaitEvent. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM + static bool WaitStreamOnEvent(CUcontext context, CUstream stream, + CUevent event); + + // Blocks the calling thread until the operations enqueued onto stream have + // been completed, via cuStreamSynchronize. + // + // TODO(leary) if a pathological thread enqueues operations onto the stream + // while another thread blocks like this, can you wind up waiting an unbounded + // amount of time? + // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad + static bool SynchronizeStream(CUcontext context, CUstream stream); + + // Blocks the calling thread until the operations associated with the context + // have been completed, via cuCtxSynchronize. + // + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616 + static bool SynchronizeContext(CUcontext context); + + // Returns true if all stream tasks have completed at time of the call. Note + // the potential for races around this call (if another thread adds work to + // the stream immediately after this returns). + static bool IsStreamIdle(CUcontext context, CUstream stream); + + // Returns whether code in the from context can access memory in the to + // context via cuDeviceCanAccessPeer. 
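+  //
+  // A typical check-then-enable sequence (illustrative sketch):
+  //   if (CUDADriver::CanEnablePeerAccess(from, to)) {
+  //     port::Status status = CUDADriver::EnablePeerAccess(from, to);
+  //     if (!status.ok()) { ... handle failure ... }
+  //   }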
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
+  static bool CanEnablePeerAccess(CUcontext from, CUcontext to);
+
+  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
+  static port::Status EnablePeerAccess(CUcontext from, CUcontext to);
+
+  // Returns the elapsed milliseconds between start and stop via
+  // cuEventElapsedTime.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
+  static bool GetEventElapsedTime(CUcontext context,
+                                  float *elapsed_milliseconds, CUevent start,
+                                  CUevent stop);
+
+  // Records that an event occurred when execution reaches the current point in
+  // the stream via cuEventRecord.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
+  static port::Status RecordEvent(CUcontext context, CUevent event,
+                                  CUstream stream);
+
+  // Polls (without blocking) to determine the status of an event - pending or
+  // complete (or an error status).
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
+  static port::StatusOr<CUresult> QueryEvent(CUcontext context, CUevent event);
+
+  // -- Pointer-specific calls.
+
+  // Returns the context in which pointer was allocated or registered.
+  static port::StatusOr<CUcontext> GetPointerContext(CUdeviceptr pointer);
+
+  // Returns the device associated with the context from GetPointerContext().
+  static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);
+
+  // Returns the memory space addressed by pointer.
+  static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);
+
+  // Returns the base address and size of the device pointer dptr.
+  static port::Status GetPointerAddressRange(CUdeviceptr dptr,
+                                             CUdeviceptr *base, size_t *size);
+
+  // -- Device-specific calls.
+
+  // Returns the compute capability for the device; e.g., (3, 5).
+  // This is currently done via the deprecated device API.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
+  static port::Status GetComputeCapability(int *cc_major, int *cc_minor,
+                                           CUdevice device);
+
+  // Returns the number of multiprocessors on the device (note that the device
+  // may be multi-GPU-per-board).
+  static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);
+
+  // Returns the limit on number of threads that can be resident in a single
+  // multiprocessor.
+  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);
+
+  // Returns the limit on number of threads which may be resident for a single
+  // block (cooperative thread array).
+  static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);
+
+  // Returns the amount of shared memory available on a single GPU core (i.e.
+  // SM on NVIDIA devices).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);
+
+  // Returns the amount of shared memory available for a single block
+  // (cooperative thread array).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);
+
+  // Returns the maximum supported number of registers per block.
+  static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
+
+  // Returns the number of threads per warp.
+  static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
+
+  // Queries the grid limits for device with cuDeviceGetAttribute calls.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool GetGridLimits(int *x, int *y, int *z, CUdevice device);
+
+  // Returns a grab-bag of device properties in a caller-owned device_properties
+  // structure for device_ordinal via cuDeviceGetProperties.
+  // This call is deprecated in the NVIDIA driver API.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
+  static bool GetDeviceProperties(CUdevprop *device_properties,
+                                  int device_ordinal);
+
+  // Returns whether ECC is enabled for the given CUdevice via
+  // cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool IsEccEnabled(CUdevice device, bool *result);
+
+  // Returns the total amount of memory available for allocation by the CUDA
+  // context, in bytes, via cuDeviceTotalMem.
+  static bool GetDeviceTotalMemory(CUdevice device, uint64 *result);
+
+  // Returns the free amount of memory and total amount of memory, as reported
+  // by cuMemGetInfo.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
+  static bool GetDeviceMemoryInfo(CUcontext context, int64 *free, int64 *total);
+
+  // Returns a PCI bus id string for the device.
+  // [domain]:[bus]:[device].[function]
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
+  static string GetPCIBusID(CUdevice device);
+
+  // -- Context- and device-independent calls.
+
+  // Returns the number of visible CUDA devices via cuDeviceGetCount.
+  // This should correspond to the set of device ordinals available.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
+  static int GetDeviceCount();
+
+  // Returns the driver version number via cuDriverGetVersion.
+  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
+  // instead, the CUDA toolkit release number that this driver is compatible
+  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
+  // compatible driver).
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
+  static bool GetDriverVersion(int *driver_version);
+
+  // -- Other calls.
+
+  // Returns the maximum number of blocks (per multiprocessor) occupied by the
+  // specified kernel/CUfunction when launched with the specified parameters.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
+  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
+      CUcontext context, CUfunction kernel, int threads_per_block,
+      size_t dynamic_shared_memory_bytes);
+
+  // Seam for injecting an error at CUDA initialization time for testing
+  // purposes.
+  static bool driver_inject_init_error_;
+};
+
+// Ensures a context is activated within a scope.
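+//
+// For example (illustrative sketch):
+//   {
+//     ScopedActivateContext activation{context};
+//     // ... driver calls that require context to be active ...
+//   }  // Prior context restored on scope exit.
+//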
+class ScopedActivateContext { + public: + // Activates the context via cuCtxSetCurrent, if it is not the currently + // active context (a la cuCtxGetCurrent). Note the alternative push/pop + // mechanism is said by NVIDIA to be relatively slow and deprecated. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7 + explicit ScopedActivateContext( + CUcontext context, MultiOpActivation moa = MultiOpActivation::kNo); + + // Checks that the context has remained activated for the duration of the + // scope. + ~ScopedActivateContext(); + + private: + CUcontext context_; // context being activated. + + CUcontext prior_context_; // context that was active when we were activated. + + // Stores whether this was instantiated during a MultiOpActivation, in which + // case we will not pop the context when we're destroyed (we will leave it to + // the parent MultiOpActivation that we were nested within). + bool previously_in_multi_op_activation_; +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc new file mode 100644 index 0000000000..a87c868c6b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_event.cc @@ -0,0 +1,56 @@ +#include "tensorflow/stream_executor/cuda/cuda_event.h" + +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +CUDAEvent::CUDAEvent(CUDAExecutor* parent) + : parent_(parent), cuda_event_(nullptr) {} + +CUDAEvent::~CUDAEvent() {} + +port::Status CUDAEvent::Init() { + return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_, + CUDADriver::EventFlags::kDisableTiming); +} + +port::Status CUDAEvent::Destroy() { + return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_); +} + +port::Status CUDAEvent::Record(CUDAStream* stream) { + return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_, + stream->cuda_stream()); +} + +Event::Status CUDAEvent::PollForStatus() { + port::StatusOr<CUresult> status = + CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_); + if (!status.ok()) { + LOG(ERROR) << "Error polling for event status: " + << status.status().error_message(); + return Event::Status::kError; + } + + switch (status.ValueOrDie()) { + case CUDA_SUCCESS: + return Event::Status::kComplete; + case CUDA_ERROR_NOT_READY: + return Event::Status::kPending; + default: + LOG(INFO) << "Error condition returned for event status: " + << status.ValueOrDie(); + return Event::Status::kError; + } +} + +const CUevent& CUDAEvent::cuda_event() { + return cuda_event_; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h new file mode 100644 index 0000000000..c5b65662db --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_event.h @@ -0,0 +1,49 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_ + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/lib/status.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// CUDAEvent 
wraps a CUevent in the platform-independent EventInterface
+// interface.
+class CUDAEvent : public internal::EventInterface {
+ public:
+  explicit CUDAEvent(CUDAExecutor* parent);
+
+  ~CUDAEvent() override;
+
+  // Populates the CUDA-platform-specific elements of this object.
+  port::Status Init();
+
+  // Deallocates any platform-specific elements of this object. This is broken
+  // out (not part of the destructor) to allow for error reporting.
+  port::Status Destroy();
+
+  // Inserts the event at the current position into the specified stream.
+  port::Status Record(CUDAStream* stream);
+
+  // Polls the CUDA platform for the event's current status.
+  Event::Status PollForStatus();
+
+  // The underlying CUDA event element.
+  const CUevent& cuda_event();
+
+ private:
+  // The Executor to which this object and its CUevent are bound.
+  CUDAExecutor* parent_;
+
+  // The underlying CUDA event element.
+  CUevent cuda_event_;
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
new file mode 100644
index 0000000000..59c3159895
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -0,0 +1,327 @@
+#include "tensorflow/stream_executor/cuda/cuda_fft.h"
+
+#include <dlfcn.h>
+
+#include <complex>
+
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
+
+namespace dynload {
+
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                           \
+    static const char *kName;                                              \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;       \
+    static void *GetDsoHandle() {                                          \
+      static auto status = internal::CachedDsoLoader::GetCufftDsoHandle(); \
+      return status.ValueOrDie();                                          \
+    }                                                                      \
+    static FuncPointerT DynLoad() {                                        \
+      static void *f = dlsym(GetDsoHandle(), kName);                       \
+      CHECK(f != nullptr) << "could not find " << kName                    \
+                          << " in cuFFT DSO; dlerror: " << dlerror();      \
+      return reinterpret_cast<FuncPointerT>(f);                            \
+    }                                                                      \
+    template <typename... Args>                                            \
+    cufftResult operator()(CUDAExecutor * parent, Args... 
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +#define CUFFT_ROUTINE_EACH(__macro) \ + __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d) \ + __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany) \ + __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C) \ + __macro(cufftExecC2R) __macro(cufftExecZ2Z) \ + __macro(cufftExecR2C) + +CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP) + +} // namespace dynload + +namespace { + +// A helper function transforming gpu_fft arguments into cuFFT arguments. +cufftType CUDAFftType(fft::Type type) { + switch (type) { + case fft::Type::kC2CForward: + case fft::Type::kC2CInverse: + return CUFFT_C2C; + case fft::Type::kC2R: + return CUFFT_C2R; + case fft::Type::kR2C: + return CUFFT_R2C; + case fft::Type::kZ2ZForward: + case fft::Type::kZ2ZInverse: + return CUFFT_Z2Z; + case fft::Type::kZ2D: + return CUFFT_Z2D; + case fft::Type::kD2Z: + return CUFFT_D2Z; + default: + LOG(FATAL) << "Invalid value of fft::Type."; + } +} + +// Associates the given stream with the given cuFFT plan. +bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) { + auto ret = dynload::cufftSetStream(parent, plan, AsCUDAStreamValue(stream)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret; + return false; + } + return true; +} + +} // namespace + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = dynload::cufftPlan1d(parent, &plan_, num_x, CUDAFftType(type), + 1 /* = batch */); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, + fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = + dynload::cufftPlan2d(parent, &plan_, num_x, num_y, CUDAFftType(type)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, + uint64 num_z, fft::Type type) + : parent_(parent), fft_type_(type) { + auto ret = dynload::cufftPlan3d(parent, &plan_, num_x, num_y, num_z, + CUDAFftType(type)); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret; + } +} + +CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count, + uint64 *input_embed, uint64 input_stride, + uint64 input_distance, uint64 *output_embed, + uint64 output_stride, uint64 output_distance, + fft::Type type, int batch_count) + : parent_(parent), fft_type_(type) { + int elem_count_[3], input_embed_[3], output_embed_[3]; + for (int i = 0; i < rank; ++i) { + elem_count_[i] = elem_count[i]; + if (input_embed) { + input_embed_[i] = input_embed[i]; + } + if (output_embed) { + output_embed_[i] = output_embed[i]; + } + } + auto ret = dynload::cufftPlanMany( + parent, &plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr, + input_stride, input_distance, output_embed ? 
output_embed_ : nullptr, + output_stride, output_distance, CUDAFftType(type), batch_count); + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to create cuFFT batched plan:" << ret; + } +} + +CUDAFftPlan::~CUDAFftPlan() { dynload::cufftDestroy(parent_, plan_); } + +int CUDAFftPlan::GetFftDirection() const { + switch (fft_type_) { + case fft::Type::kC2CForward: + case fft::Type::kZ2ZForward: + case fft::Type::kR2C: + case fft::Type::kD2Z: + return CUFFT_FORWARD; + case fft::Type::kC2CInverse: + case fft::Type::kZ2ZInverse: + case fft::Type::kC2R: + case fft::Type::kZ2D: + return CUFFT_INVERSE; + default: + LOG(FATAL) << "Invalid value of fft::Type."; + } +} + +std::unique_ptr<fft::Plan> CUDAFft::Create1dPlan(Stream *stream, uint64 num_x, + fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::Create2dPlan(Stream *stream, uint64 num_x, + uint64 num_y, fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, num_y, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::Create3dPlan(Stream *stream, uint64 num_x, + uint64 num_y, uint64 num_z, + fft::Type type, + bool in_place_fft) { + std::unique_ptr<fft::Plan> plan{ + new CUDAFftPlan(parent_, num_x, num_y, num_z, type)}; + return plan; +} + +std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan( + Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed, + uint64 input_stride, uint64 input_distance, uint64 *output_embed, + uint64 output_stride, uint64 output_distance, fft::Type type, + bool in_place_fft, int batch_count) { + std::unique_ptr<fft::Plan> plan{new CUDAFftPlan( + parent_, rank, elem_count, input_embed, input_stride, input_distance, + output_embed, output_stride, output_distance, type, batch_count)}; + return plan; +} + +template <typename FuncT, typename InputT, typename OutputT> +bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output) { + CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan); + if (cuda_fft_plan == nullptr) { + LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object."; + return false; + } + + if (!SetStream(parent_, cuda_fft_plan->GetPlan(), stream)) { + return false; + } + + auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(), + CUDAComplex(const_cast<InputT *>(CUDAMemory(input))), + CUDAComplex(CUDAMemoryMutable(output))); + + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine: " << ret; + return false; + } + + return true; +} + +template <typename FuncT, typename InputT, typename OutputT> +bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan, + FuncT cufftExec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output) { + CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan); + if (cuda_fft_plan == nullptr) { + LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object."; + return false; + } + + if (!SetStream(parent_, cuda_fft_plan->GetPlan(), stream)) { + return false; + } + + auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(), + CUDAComplex(const_cast<InputT *>(CUDAMemory(input))), + CUDAComplex(CUDAMemoryMutable(output)), + cuda_fft_plan->GetFftDirection()); + + if (ret != CUFFT_SUCCESS) { + LOG(ERROR) << "failed to run cuFFT routine: " << ret; + return false; + } + + return true; +} + +#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, 
__fft_type1, __fft_type2,   \
+                                           __fft_type3)                        \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<std::complex<__type>> &input,         \
+                      DeviceMemory<std::complex<__type>> *output) {            \
+    return DoFftWithDirectionInternal(                                         \
+        stream, plan, dynload::cufftExec##__fft_type1, input, output);         \
+  }                                                                            \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<__type> &input,                       \
+                      DeviceMemory<std::complex<__type>> *output) {            \
+    return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type2, input, \
+                         output);                                              \
+  }                                                                            \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<std::complex<__type>> &input,         \
+                      DeviceMemory<__type> *output) {                          \
+    return DoFftInternal(stream, plan, dynload::cufftExec##__fft_type3, input, \
+                         output);                                              \
+  }
+
+PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
+PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+
+#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+namespace gpu = ::perftools::gputools;
+
+REGISTER_MODULE_INITIALIZER(register_cufft, {
+  gpu::port::Status status =
+      gpu::PluginRegistry::Instance()
+          ->RegisterFactory<gpu::PluginRegistry::FftFactory>(
+              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT",
+              [](gpu::internal::StreamExecutorInterface
+                     *parent) -> gpu::fft::FftSupport * {
+                gpu::cuda::CUDAExecutor *cuda_executor =
+                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
+                if (cuda_executor == nullptr) {
+                  LOG(ERROR)
+                      << "Attempting to initialize an instance of the cuFFT "
+                      << "support library with a non-CUDA StreamExecutor";
+                  return nullptr;
+                }
+
+                return new gpu::cuda::CUDAFft(cuda_executor);
+              });
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to register cuFFT factory: "
+               << status.error_message();
+  }
+
+  // Prime the cuFFT DSO. The loader will log more information.
+  auto statusor = gpu::internal::CachedDsoLoader::GetCufftDsoHandle();
+  if (!statusor.ok()) {
+    LOG(INFO) << "Unable to load cuFFT DSO.";
+  }
+
+  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
+                                                     gpu::PluginKind::kFft,
+                                                     gpu::cuda::kCuFftPlugin);
+});
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
new file mode 100644
index 0000000000..2577c2952e
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -0,0 +1,95 @@
+// CUDA-specific support for FFT functionality -- this wraps the cuFFT library
+// capabilities, and is only included into CUDA implementation code -- it will
+// not introduce cuda headers into other code.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
+
+#include "tensorflow/stream_executor/fft.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "third_party/gpus/cuda/include/cufft.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+
+namespace cuda {
+
+class CUDAExecutor;
+
+// Opaque and unique identifier for the cuFFT plugin.
+extern const PluginId kCuFftPlugin;
+
+class CUDAFftPlan : public fft::Plan {
+ public:
+  // Constructor creating 1d FFT plan.
+  CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type);
+  // Constructor creating 2d FFT plan.
+  CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, fft::Type type);
+  // Constructor creating 3d FFT plan.
+ CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, uint64 num_z, + fft::Type type); + // Constructor creating batched FFT plan. + CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count, + uint64 *input_embed, uint64 input_stride, uint64 input_distance, + uint64 *output_embed, uint64 output_stride, + uint64 output_distance, fft::Type type, int batch_count); + ~CUDAFftPlan() override; + + // Get FFT direction in cuFFT based on FFT type. + int GetFftDirection() const; + cufftHandle GetPlan() const { return plan_; } + + private: + CUDAExecutor *parent_; + cufftHandle plan_; + fft::Type fft_type_; +}; + +// FFT support for CUDA platform via cuFFT library. +// +// This satisfies the platform-agnostic FftSupport interface. +// +// Note that the cuFFT handle that this encapsulates is implicitly tied to the +// context (and, as a result, the device) that the parent CUDAExecutor is tied +// to. This simply happens as an artifact of creating the cuFFT handle when a +// CUDA context is active. +// +// Thread-safe. The CUDA context associated with all operations is the CUDA +// context of parent_, so all context is explicit. +class CUDAFft : public fft::FftSupport { + public: + explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {} + ~CUDAFft() override {} + + TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES + + private: + CUDAExecutor *parent_; + + // Two helper functions that execute dynload::cufftExec?2?. + + // This is for complex to complex FFT, when the direction is required. + template <typename FuncT, typename InputT, typename OutputT> + bool DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan, + FuncT cufft_exec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output); + + // This is for complex to real or real to complex FFT, when the direction + // is implied. 
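+  //
+  // For example, the R2C specialization generated in cuda_fft.cc boils down
+  // to a call of the form (illustrative):
+  //   DoFftInternal(stream, plan, dynload::cufftExecR2C, input, output);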
+ template <typename FuncT, typename InputT, typename OutputT> + bool DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufft_exec, + const DeviceMemory<InputT> &input, + DeviceMemory<OutputT> *output); + + SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft); +}; + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc new file mode 100644 index 0000000000..77f16e2a6e --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -0,0 +1,1082 @@ +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" + +#include <unistd.h> + +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_event.h" +#include "tensorflow/stream_executor/cuda/cuda_platform.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/cuda/cuda_timer.h" +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/mathutil.h" +#include "tensorflow/stream_executor/lib/path.h" +#include "tensorflow/stream_executor/lib/process_state.h" +#include "tensorflow/stream_executor/lib/ptr_util.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/lib/str_util.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/timer.h" +#include "tensorflow/stream_executor/lib/numbers.h" + +#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_ +#error \ + "No driver calls in this file, wrap driver functionality in cuda_driver.cc." +#endif + +#ifdef __CUDA_RUNTIME_H__ +#error \ + "CUDA runtime being included into CUDA GPU executor; should be driver only." +#endif + +extern bool FLAGS_check_gpu_leaks; +tensorflow::int32 FLAGS_register_occupancy_warning_threshold; +bool FLAGS_prefer_cubin_to_ptx = true; + +namespace perftools { +namespace gputools { +namespace rng { +class RngSupport; +} // namespace rng +} // namespace gputools +} // namespace perftools + +namespace perftools { +namespace gputools { +namespace cuda { + +// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver. +// It has been observed that loading both PTX and cubins into the driver library +// can cause it to crash, but loading only CUBINs avoids those crashes; +// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of +// PTX code. +// +// As this is an implementation-detail workaround, the usage is to declare this +// variable with extern linkage and populate it from another translation unit. 
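+//
+// For example, a separate translation unit might install a hook like the
+// following (an illustrative sketch; CompilePtxToCubin is a hypothetical
+// helper, not part of this library):
+//   extern std::function<string(const string &)> g_cubinate;
+//   ...
+//   g_cubinate = [](const string &ptx) { return CompilePtxToCubin(ptx); };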
+std::function<string(const string &)> g_cubinate; + +static CUDAEvent *AsCUDAEvent(Event *event) { + DCHECK(event != nullptr); + return static_cast<CUDAEvent *>(event->implementation()); +} + +// Given a platform-independent stream datatype, returns the internal CUDA +// platform implementation pointer. +static CUDAStream *AsCUDAStream(Stream *stream) { + DCHECK(stream != nullptr); + return static_cast<CUDAStream *>(stream->implementation()); +} + +// Given a platform-independent stream datatype, returns the platform +// implementation's internal value, suitable for passing directly to libcuda +// APIs. +CUstream AsCUDAStreamValue(Stream *stream) { + DCHECK(stream != nullptr); + return AsCUDAStream(stream)->cuda_stream(); +} + +// Given a platform-independent timer datatype, returns the internal CUDA +// platform implementation pointer. +static CUDATimer *AsCUDATimer(Timer *timer) { + DCHECK(timer != nullptr); + return static_cast<CUDATimer *>(timer->implementation()); +} + +// Given const GPU memory, returns a libcuda device pointer datatype, suitable +// for passing directly to libcuda APIs. +// +// N.B. we must lose constness in order to pass a suitable type to the existing +// libcuda APIs, so the caller should take care to only pass the result of const +// GPU memory conversions to libcuda functions which will honor constness. +static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase &gpu_mem) { + return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque()); +} + +// See description on const version above. +static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) { + return AsCudaDevicePtr(*gpu_mem); +} + +static CUcontext GetCudaContext(Stream *stream) { + return static_cast<CUDAExecutor *>(stream->parent()->implementation()) + ->cuda_context(); +} + +CUcontext ExtractCudaContext(CUDAExecutor *cuda_exec) { + CHECK(cuda_exec != nullptr); + return cuda_exec->cuda_context(); +} + +CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) { + return static_cast<CUDAExecutor *>(stream_exec->implementation()); +} + +CUDAExecutor::~CUDAExecutor() { + for (auto &it : disk_modules_) { + CUDADriver::UnloadModule(context_, it.second); + } + for (auto &it : in_memory_modules_) { + CUDADriver::UnloadModule(context_, it.second); + } + if (context_ != nullptr) { + CUDADriver::DestroyContext(context_); + } +} + +port::Status CUDAExecutor::Init(int device_ordinal, + DeviceOptions device_options) { + device_ordinal_ = device_ordinal; + + auto status = CUDADriver::Init(); + if (!status.ok()) { + return status; + } + + status = CUDADriver::GetDevice(device_ordinal_, &device_); + if (!status.ok()) { + return status; + } + + status = CUDADriver::CreateContext(device_, device_options, &context_); + if (!status.ok()) { + return status; + } + + return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_); +} + +bool CUDAExecutor::FindOnDiskForComputeCapability( + port::StringPiece filename, port::StringPiece canonical_suffix, + string *found_filename) const { + if (cc_major_ == 0 && cc_minor_ == 0) { + return false; + } + + // TODO(22689637): Eliminate unnecessary ToString()s when all dependencies + // have been migrated. 
+  string cc_specific = port::StrCat(filename.ToString(), ".cc", cc_major_,
+                                    cc_minor_, canonical_suffix.ToString());
+  if (port::FileExists(cc_specific)) {
+    VLOG(2) << "found compute-capability-specific file, using that: "
+            << cc_specific;
+    *found_filename = cc_specific;
+    return true;
+  }
+
+  VLOG(2) << "could not find compute-capability-specific file at: "
+          << cc_specific;
+  if (port::FileExists(filename.ToString())) {
+    *found_filename = filename.ToString();
+    return true;
+  }
+
+  return false;
+}
+
+// Returns the path to the running executable.
+// N.B. Derived from //knowledge/smalltalk/background_kb.cc
+// Arg: strip_exe: if true, remove the name of the executable itself from the
+//                 returned string. Example: calling this from /usr/bin/foo
+//                 would return /usr/bin.
+static string GetBinaryDir(bool strip_exe) {
+  char exe_path[PATH_MAX] = {0};
+  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  // Make sure it's null-terminated:
+  exe_path[sizeof(exe_path) - 1] = 0;
+
+  if (strip_exe) {
+    // The exe is the last component of the path, so remove one component.
+    string ret = exe_path;
+    std::vector<string> components = port::Split(exe_path, '/');
+    components.pop_back();
+    return port::Join(components, "/");
+  }
+  return exe_path;
+}
+
+// Returns the location of the runfiles directory.
+// This is the directory which "bazel run" sets as the current working directory
+// before the program starts.
+// N.B. This doesn't have to be running under "bazel run" in order to get the
+// appropriate runfiles directory.
+static string GetRunfilesDir() {
+  return port::StrCat(GetBinaryDir(false), ".runfiles");
+}
+
+bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
+                             KernelBase *kernel) {
+  CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
+  CUmodule module = nullptr;
+  const string *kernelname;
+
+  const OnDiskKernelLoaderSpec *on_disk_spec = nullptr;
+  bool has_ptx = spec.has_cuda_ptx_on_disk();
+  bool has_cubin = spec.has_cuda_cubin_on_disk();
+  if (has_cubin && (!has_ptx || FLAGS_prefer_cubin_to_ptx)) {
+    on_disk_spec = &spec.cuda_cubin_on_disk();
+  } else if (has_ptx) {
+    on_disk_spec = &spec.cuda_ptx_on_disk();
+  }
+
+  if (on_disk_spec != nullptr) {
+  } else if (spec.has_cuda_ptx_in_memory()) {
+    kernelname = &spec.cuda_ptx_in_memory().kernelname();
+
+    if (cc_major_ == 0 && cc_minor_ == 0) {
+      return false;
+    }
+
+    // Note that the original ptx may be compressed, and the ptx we get below is
+    // the decompressed result. To cache the module we should use the original
+    // ptx (compressed one) as the key. This is because, for the same compressed
+    // ptx, the decompressed ptx may land at a different pointer value each time.
+ const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_); + const char *orig_ptx = + spec.cuda_ptx_in_memory().original_text(cc_major_, cc_minor_); + if (ptx == nullptr || orig_ptx == nullptr) { + ptx = spec.cuda_ptx_in_memory().default_text(); + orig_ptx = spec.cuda_ptx_in_memory().original_default_text(); + } + if (ptx == nullptr || orig_ptx == nullptr) { + LOG(FATAL) << "could not load ptx for kernel " << kernelname; + return false; + } + + mutex_lock lock{in_memory_modules_mu_}; + module = in_memory_modules_[orig_ptx]; + + if (module == nullptr) { + if (g_cubinate == nullptr) { + if (!CUDADriver::LoadPtx(context_, ptx, &module)) { + return false; + } + } else { + string cubin = g_cubinate(ptx); + auto load_status = + CUDADriver::LoadCubin(context_, cubin.c_str(), &module); + if (!load_status.ok()) { + LOG(ERROR) << "failed to load cubin via hook: " << load_status; + return false; + } + } + in_memory_modules_[orig_ptx] = module; + } + } else if (spec.has_cuda_cubin_in_memory()) { + kernelname = &spec.cuda_cubin_in_memory().kernelname(); + const char *cubin = spec.cuda_cubin_in_memory().bytes(); + mutex_lock lock{in_memory_modules_mu_}; + module = in_memory_modules_[cubin]; + + if (module == nullptr) { + auto load_status = CUDADriver::LoadCubin(context_, cubin, &module); + if (!load_status.ok()) { + LOG(ERROR) << "failed to load CUBIN: " << load_status; + return false; + } + + in_memory_modules_[cubin] = module; + } + } else { + LOG(WARNING) << "no method of loading CUDA kernel provided"; + return false; + } + + VLOG(2) << "getting function " << kernelname << " from module " << module; + if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(), + cuda_kernel->cuda_function_ptr())) { + return false; + } + + // We have to trust the kernel loader spec arity because there doesn't appear + // to be a way to reflect on the number of expected arguments w/the CUDA API. 
+ cuda_kernel->set_arity(spec.arity()); + + KernelMetadata kernel_metadata; + if (!GetKernelMetadata(cuda_kernel, &kernel_metadata)) { + LOG(WARNING) << "Unable to get metadata for kernel " << kernelname; + } + kernel->set_metadata(kernel_metadata); + kernel->set_name(*kernelname); + return true; +} + +bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel, + KernelMetadata *kernel_metadata) { + int value; + if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS, + *cuda_kernel->cuda_function_ptr(), + &value)) { + return false; + } + kernel_metadata->set_registers_per_thread(value); + + if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + *cuda_kernel->cuda_function_ptr(), + &value)) { + return false; + } + kernel_metadata->set_shared_memory_bytes(value); + + return true; +} + +bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &kernel, + const std::vector<KernelArg> &args) { + CHECK_EQ(kernel.Arity(), args.size()); + CUstream custream = AsCUDAStreamValue(stream); + const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel); + CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue(); + + std::vector<void *> addrs; + addrs.reserve(args.size()); + int shmem_bytes = 0; + for (size_t i = 0; i < args.size(); i++) { + switch (args[i].type) { + case KernelArg::kNormal: + addrs.push_back(const_cast<void *>( + static_cast<const void *>(args[i].data.begin()))); + break; + case KernelArg::kSharedMemory: + shmem_bytes += args[i].bytes; + break; + default: + LOG(ERROR) << "Invalid kernel arg type passed (" << args[i].type + << ") for arg " << i; + return false; + } + } + + // Only perform/print the occupancy check 1x. + launched_kernels_mu_.lock(); + if (launched_kernels_.find(cufunc) == launched_kernels_.end()) { + OccupancyCheck(kernel, thread_dims, block_dims); + // TODO(rspringer): Remove elements from launched_kernels_...if we ever + // expose a kernel/module deallocation method. + launched_kernels_.insert(cufunc); + } + launched_kernels_mu_.unlock(); + + if (cuda_kernel->GetPreferredCacheConfig() != + KernelCacheConfig::kNoPreference) { + CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig()); + } + + if (!CUDADriver::LaunchKernel( + GetCudaContext(stream), cufunc, block_dims.x, block_dims.y, + block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z, + shmem_bytes, custream, addrs.data(), nullptr /* = extra */)) { + LOG(ERROR) << "failed to launch CUDA kernel with args: " << args.size() + << "; thread dim: " << thread_dims.ToString() + << "; block dim: " << block_dims.ToString(); + return false; + } + + return true; +} + +// This is a non-essential operation; if there's a failure, proceed without +// logging an error. It's nearly certain that in case of failures, we'd never +// get here in the first place; these are very low-impact routines. 
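+// As a worked example of the occupancy arithmetic (illustrative numbers only,
+// ignoring shared-memory and other limits): with 65536 registers per SM,
+// 256 threads per block, and 64 registers per thread, at most
+// 65536 / (256 * 64) = 4 blocks fit per SM; reducing usage to 51 registers
+// per thread would allow a fifth resident block.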
+void CUDAExecutor::OccupancyCheck(const KernelBase &kernel, + const ThreadDim &thread_dims, + const BlockDim &block_dims) { + VLOG(2) << "Computing kernel occupancy for kernel " + << kernel.demangled_name(); + VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y + << ", " << thread_dims.z << ")"; + + int regs_per_thread; + if (!kernel.metadata().registers_per_thread(®s_per_thread)) { + return; + } + + int smem_per_block; + if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) { + return; + } + + const DeviceDescription &device_description = + kernel.parent()->GetDeviceDescription(); + + uint64 blocks_per_sm = CalculateOccupancy( + device_description, regs_per_thread, smem_per_block, thread_dims); + VLOG(2) << "Resident blocks per SM is " << blocks_per_sm; + + // To increase occupancy, there must be a sufficient number of blocks + // available to spread across the sm's at this new improved occupancy level. + int multiprocessor_count = device_description.core_count(); + int block_count = block_dims.x * block_dims.y * block_dims.z; + int available_blocks_per_sm = + port::MathUtil::CeilOfRatio(block_count, multiprocessor_count); + if (available_blocks_per_sm <= static_cast<int64>(blocks_per_sm)) { + VLOG(2) << "Occupancy is limited by number of blocks available per sm."; + return; + } + + uint64 improved_regs_per_thread = CalculateRegisterLimitForTargetOccupancy( + device_description, smem_per_block, thread_dims, blocks_per_sm + 1); + if (improved_regs_per_thread != 0) { + VLOG(2) << "Reducing register usage from " << regs_per_thread + << " to " << improved_regs_per_thread + << " could increase resident blocks per SM by one."; + + uint64 reg_reduction = regs_per_thread - improved_regs_per_thread; + if (reg_reduction <= + static_cast<uint64>(FLAGS_register_occupancy_warning_threshold)) { + LOG(INFO) << "Notice: occupancy would increase if register usage was" + << " reduced from " << regs_per_thread + << " to " << improved_regs_per_thread + << " registers per thread for kernel: " + << kernel.demangled_name(); + } + } else { + VLOG(2) << "Resident blocks per SM cannot be increased by reducing " + "register usage."; + } +} + +void *CUDAExecutor::Allocate(uint64 size) { + return CUDADriver::DeviceAllocate(context_, size); +} + +void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem, + uint64 offset_bytes, uint64 size_bytes) { + // offset and size are in bytes, so char* works as the pointer type. + return reinterpret_cast<char *>(mem->opaque()) + offset_bytes; +} + +void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) { + // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary. 
+ if (!mem->is_sub_buffer()) { + CUDADriver::DeviceDeallocate(context_, mem->opaque()); + } +} + +bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) { + if (location == nullptr || size == 0) { + LOG(WARNING) << "attempting to register null or zero-sized memory: " + << location << "; size " << size; + } + VLOG(2) << "registering " << location << " size " << size; + return CUDADriver::HostRegister(context_, location, size); +} + +bool CUDAExecutor::HostMemoryUnregister(void *location) { + VLOG(2) << "unregistering " << location; + return CUDADriver::HostUnregister(context_, location); +} + +bool CUDAExecutor::SynchronizeAllActivity() { + return CUDADriver::SynchronizeContext(context_); +} + +bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) { + if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0) { + return CUDADriver::SynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), 0x0, size / 4); + } + return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location), + 0x0, size); +} + +bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) { + if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0) { + // cudaMemset reinterprets "value" as a uint8. + uint8 byte_value = static_cast<uint8>(value); + uint32 pattern = (byte_value << 24) | (byte_value << 16) | + (byte_value << 8) | byte_value; + return CUDADriver::SynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), pattern, size / 4); + } + return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location), + value, size); +} + +bool CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst), + host_src, size); +} + +bool CUDAExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + return CUDADriver::SynchronousMemcpyD2H(context_, host_dst, + AsCudaDevicePtr(gpu_src), size); +} + +bool CUDAExecutor::SynchronousMemcpyDeviceToDevice( + DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) { + return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst), + AsCudaDevicePtr(gpu_src), size); +} + +bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) { + return Memset32(stream, location, 0x0, size); +} + +bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location, + uint32 pattern, uint64 size) { + VLOG(2) << "enqueueing memset32 operation onto stream " << stream + << " at location " << location << " with size " << size + << " and pattern " << std::hex << pattern; + CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 && + size % 4 == 0); + return CUDADriver::AsynchronousMemsetUint32( + context_, AsCudaDevicePtr(location), pattern, size / 4, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst, + AsCudaDevicePtr(gpu_src), size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst), + host_src, size, + AsCUDAStreamValue(stream)); +} + +bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream, + DeviceMemoryBase *gpu_dst, + const 
DeviceMemoryBase &gpu_src,
+                                        uint64 size) {
+  return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
+                                           AsCudaDevicePtr(gpu_src), size,
+                                           AsCUDAStreamValue(stream));
+}
+
+bool CUDAExecutor::HostCallback(Stream *stream,
+                                std::function<void()> callback) {
+  auto callback_ptr = new std::function<void()>(callback);
+  return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
+                                       InternalHostCallback, callback_ptr);
+}
+
+/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream,
+                                                     CUresult status,
+                                                     void *data) {
+  std::function<void()> *callback =
+      reinterpret_cast<std::function<void()> *>(data);
+  (*callback)();
+  delete callback;
+}
+
+port::Status CUDAExecutor::AllocateEvent(Event *event) {
+  return AsCUDAEvent(event)->Init();
+}
+
+port::Status CUDAExecutor::DeallocateEvent(Event *event) {
+  return AsCUDAEvent(event)->Destroy();
+}
+
+port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) {
+  return AsCUDAEvent(event)->Record(AsCUDAStream(stream));
+}
+
+port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
+  if (CUDADriver::WaitStreamOnEvent(context_,
+                                    AsCUDAStream(stream)->cuda_stream(),
+                                    AsCUDAEvent(event)->cuda_event())) {
+    return port::Status::OK();
+  } else {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf("error waiting for CUDA event on stream %p", stream)};
+  }
+}
+
+Event::Status CUDAExecutor::PollForEventStatus(Event *event) {
+  return AsCUDAEvent(event)->PollForStatus();
+}
+
+bool CUDAExecutor::AllocateStream(Stream *stream) {
+  return AsCUDAStream(stream)->Init();
+}
+
+void CUDAExecutor::DeallocateStream(Stream *stream) {
+  CUDAStream *cuda_stream = AsCUDAStream(stream);
+  if (!cuda_stream->IsIdle()) {
+    LOG(ERROR) << "Deallocating stream with pending work";
+  }
+  cuda_stream->Destroy();
+}
+
+bool CUDAExecutor::AllocateTimer(Timer *timer) {
+  return AsCUDATimer(timer)->Init();
+}
+
+void CUDAExecutor::DeallocateTimer(Timer *timer) {
+  AsCUDATimer(timer)->Destroy();
+}
+
+bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
+  CUevent other_completed_event;
+  bool ok =
+      AsCUDAStream(other)->GetOrCreateCompletedEvent(&other_completed_event);
+  if (!ok) {
+    LOG(ERROR) << "failed to get completion event from other; "
+                  "therefore, failed to create inter-stream dependency";
+    return false;
+  }
+
+  ok = CUDADriver::RecordEvent(context_, other_completed_event,
+                               AsCUDAStreamValue(other))
+           .ok();
+  if (!ok) {
+    LOG(ERROR) << "failed to record completion event; "
+                  "therefore, failed to create inter-stream dependency";
+    return false;
+  }
+
+  return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent),
+                                       other_completed_event);
+}
+
+bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) {
+  return AsCUDATimer(timer)->Start(AsCUDAStream(stream));
+}
+
+bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
+  return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
+}
+
+bool CUDAExecutor::BlockHostUntilDone(Stream *stream) {
+  return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
+}
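
HostCallback() above hands ownership of a heap-allocated `std::function` to the driver, and InternalHostCallback() both invokes and frees it. A self-contained sketch of that handoff, with a plain trampoline standing in for the CUDA stream-callback machinery:

```cpp
#include <cstdio>
#include <functional>

// Stand-in for InternalHostCallback: invoke the heap-allocated functor
// exactly once, then release it.
static void Trampoline(void* data) {
  auto* fn = static_cast<std::function<void()>*>(data);
  (*fn)();
  delete fn;  // ownership ends here
}

int main() {
  auto* heap_fn =
      new std::function<void()>([] { std::puts("stream drained"); });
  Trampoline(heap_fn);  // the driver would call this when the stream completes
  return 0;
}
```

+blas::BlasSupport *CUDAExecutor::CreateBlas() {
+  PluginRegistry *registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::BlasFactory> status =
+      registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId,
+                                                        plugin_config_.blas());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve BLAS factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return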
status.ValueOrDie()(this); +} + +dnn::DnnSupport *CUDAExecutor::CreateDnn() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::DnnFactory> status = + registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId, + plugin_config_.dnn()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve DNN factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +fft::FftSupport *CUDAExecutor::CreateFft() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::FftFactory> status = + registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId, + plugin_config_.fft()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve FFT factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +rng::RngSupport *CUDAExecutor::CreateRng() { + PluginRegistry *registry = PluginRegistry::Instance(); + port::StatusOr<PluginRegistry::RngFactory> status = + registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId, + plugin_config_.rng()); + if (!status.ok()) { + LOG(ERROR) << "Unable to retrieve RNG factory: " + << status.status().error_message(); + return nullptr; + } + + return status.ValueOrDie()(this); +} + +// TODO(rspringer): Remove in b/18544742. +bool CUDAExecutor::SupportsDnn() const { + return true; +} + +bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) { + CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other); + return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_); +} + +port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) { + CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other); + return CUDADriver::EnablePeerAccess(context_, cuda_other->context_); +} + +SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() { + port::StatusOr<CUsharedconfig> cuda_config = + CUDADriver::ContextGetSharedMemConfig(context_); + if (!cuda_config.ok()) { + // Don't log; the failed call will log necessary output. 
+    return SharedMemoryConfig::kDefault;
+  }
+
+  switch (cuda_config.ValueOrDie()) {
+    case CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE:
+      return SharedMemoryConfig::kDefault;
+    case CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:
+      return SharedMemoryConfig::kFourByte;
+    case CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE:
+      return SharedMemoryConfig::kEightByte;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration returned: "
+                 << cuda_config.ValueOrDie();
+  }
+}
+
+port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
+    SharedMemoryConfig config) {
+  CUsharedconfig cuda_config;
+  switch (config) {
+    case SharedMemoryConfig::kDefault:
+      cuda_config = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
+      break;
+    case SharedMemoryConfig::kFourByte:
+      cuda_config = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE;
+      break;
+    case SharedMemoryConfig::kEightByte:
+      cuda_config = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE;
+      break;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration specified: "
+                 << static_cast<int>(config);
+  }
+  return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config);
+}
+
+bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
+  return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
+}
+
+bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
+                             size_t *bytes) {
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{disk_modules_mu_};
+    for (auto &it : disk_modules_) {
+      if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                      reinterpret_cast<CUdeviceptr *>(mem),
+                                      bytes)) {
+        return true;
+      }
+    }
+  }
+
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{in_memory_modules_mu_};
+    for (auto &it : in_memory_modules_) {
+      if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                      reinterpret_cast<CUdeviceptr *>(mem),
+                                      bytes)) {
+        return true;
+      }
+    }
+  }
+
+  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
+  return false;
+}
+
+bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
+  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
+  // we use BlockDims to express the dimensions of blocks within a grid
+  // (as opposed to ThreadDim which expresses the dimensions of threads
+  // within a block).
+  int x, y, z;
+  if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) {
+    return false;
+  }
+
+  block_dim_limit->x = x;
+  block_dim_limit->y = y;
+  block_dim_limit->z = z;
+  return true;
+}
+
+KernelArg CUDAExecutor::DeviceMemoryToKernelArg(
+    const DeviceMemoryBase &gpu_mem) const {
+  const void* arg = gpu_mem.opaque();
+  const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
+
+  KernelArg kernel_arg;
+  kernel_arg.type = KernelArg::kNormal;
+  kernel_arg.data = port::InlinedVector<uint8, 4>(arg_ptr, arg_ptr + sizeof(arg));
+  kernel_arg.bytes = sizeof(arg);
+  return kernel_arg;
+}
+
+bool CUDAExecutor::SupportsBlas() const { return true; }
+
+bool CUDAExecutor::SupportsFft() const { return true; }
+
+bool CUDAExecutor::SupportsRng() const { return true; }
+
+void *CUDAExecutor::CudaContextHack() { return context_; }
+
+CUcontext CUDAExecutor::cuda_context() { return context_; }
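
For reference, the 4-byte fast path in SynchronousMemSet() earlier in this file widens the memset byte the same way cudaMemset does: only the low byte of `value` is kept, then replicated across a 32-bit word. A self-contained check of that widening:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  int value = 0x1AB;  // only the low byte (0xAB) is significant
  uint8_t byte_value = static_cast<uint8_t>(value);
  uint32_t pattern = (uint32_t{byte_value} << 24) |
                     (uint32_t{byte_value} << 16) |
                     (uint32_t{byte_value} << 8) | byte_value;
  assert(pattern == 0xABABABABu);
  return 0;
}
```

+// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
+// of SysFS. Returns -1 if it cannot.
+//
+// For anything more complicated/prod-focused than this, you'll likely want to
+// turn to gsys' topology modeling.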
+static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
+  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
+  static const int kUnknownNumaNode = -1;
+
+  if (pci_bus_id.empty()) {
+    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
+    return kUnknownNumaNode;
+  }
+
+  string filename =
+      port::Printf("/sys/bus/pci/devices/%s/numa_node", pci_bus_id.c_str());
+
+  // We have to use fopen/fread here so that the device properties can be
+  // populated before the InitGoogle procedure has completed (at which point
+  // we could use the file::* utilities).
+  FILE *file = fopen(filename.c_str(), "r");
+  if (file == nullptr) {
+    LOG(ERROR) << "could not open file to read NUMA node: " << filename;
+    return kUnknownNumaNode;
+  }
+
+  string content;
+  char buf[32];
+  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
+  buf[did_read] = '\0';
+  content = buf;
+  fclose(file);
+
+  int32 value;
+  if (port::safe_strto32(content, &value)) {
+    if (value < 0) {  // See http://b/18228951 for details on this path.
+      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
+                << value << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero";
+      return 0;
+    }
+    return value;
+  }
+
+  LOG(WARNING)
+      << "could not convert SysFS file contents to integral NUMA node value: "
+      << content;
+
+  return kUnknownNumaNode;
+}
+
+// Set of compute capability specific device parameters that cannot be
+// queried from the driver API. These values instead are baked into a
+// lookup table indexed by compute capability version.
+struct UnqueryableDeviceParams {
+  int cc_major;
+  int cc_minor;
+  uint64 blocks_per_core_limit;
+  uint64 registers_per_core_limit;
+  uint64 registers_per_thread_limit;
+  uint64 warp_alloc_granularity;
+  uint64 register_alloc_granularity;
+  uint64 shared_memory_alloc_granularity;
+};
+
+static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = {
+  {
+    3, 5,       // compute capability (3.5)
+    16,         // blocks_per_core_limit
+    64 * 1024,  // registers_per_core_limit
+    255,        // registers_per_thread_limit
+    4,          // warp_alloc_granularity
+    256,        // register_alloc_granularity
+    256         // shared_memory_alloc_granularity
+  }
+};
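
The parse-and-clamp behavior in TryToReadNumaNode() above is worth pinning down: sysfs reports -1 on hosts without NUMA information, and that gets mapped to node zero. A self-contained sketch, with `strtol` standing in for `port::safe_strto32`:

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

static int ParseNumaNode(const std::string& content) {
  char* end = nullptr;
  long value = std::strtol(content.c_str(), &end, 10);
  if (end == content.c_str()) {
    return -1;  // unparseable -> unknown NUMA node
  }
  if (value < 0) {
    return 0;  // negative sysfs value -> treat as node zero
  }
  return static_cast<int>(value);
}

int main() {
  std::printf("%d %d %d\n", ParseNumaNode("1\n"), ParseNumaNode("-1\n"),
              ParseNumaNode("bogus"));  // prints: 1 0 -1
  return 0;
}
```

+DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
+  internal::DeviceDescriptionBuilder builder;
+
+  {
+    int driver_version = 0;
+    (void)CUDADriver::GetDriverVersion(&driver_version);
+    string augmented_driver_version = port::Printf(
+        "%d (%s)", driver_version,
+        DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
+    builder.set_driver_version(augmented_driver_version);
+  }
+
+  {
+    string pci_bus_id = CUDADriver::GetPCIBusID(device_);
+
+    // Lower the hex characters to match sysfs.
+    pci_bus_id = port::Lowercase(pci_bus_id);
+    builder.set_pci_bus_id(pci_bus_id);
+
+    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.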
+    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
+    builder.set_numa_node(numa_node);
+  }
+
+  CUdevprop prop;
+  if (CUDADriver::GetDeviceProperties(&prop, device_ordinal_)) {
+    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
+
+    ThreadDim thread_dim_limit;
+    thread_dim_limit.x = prop.maxThreadsDim[0];
+    thread_dim_limit.y = prop.maxThreadsDim[1];
+    thread_dim_limit.z = prop.maxThreadsDim[2];
+    builder.set_thread_dim_limit(thread_dim_limit);
+
+    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
+    builder.set_clock_rate_ghz(clock_rate_ghz);
+  }
+
+  {
+    bool ecc_enabled = false;
+    (void)CUDADriver::IsEccEnabled(device_, &ecc_enabled);
+    builder.set_ecc_enabled(ecc_enabled);
+  }
+
+  {
+    uint64 device_memory_size = -1;
+    (void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    builder.set_device_memory_size(device_memory_size);
+  }
+
+  {
+    BlockDim block_dim_limit;
+    FillBlockDimLimit(&block_dim_limit);
+    builder.set_block_dim_limit(block_dim_limit);
+  }
+
+  {
+    string device_name;
+    (void)CUDADriver::GetDeviceName(device_, &device_name);
+    builder.set_name(device_name);
+  }
+
+  for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
+    const auto &params = kAllUnqueryableDeviceParams[i];
+    if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) {
+      builder.set_blocks_per_core_limit(params.blocks_per_core_limit);
+      builder.set_registers_per_core_limit(params.registers_per_core_limit);
+      builder.set_registers_per_thread_limit(params.registers_per_thread_limit);
+      builder.set_warp_alloc_granularity(params.warp_alloc_granularity);
+      builder.set_register_alloc_granularity(params.register_alloc_granularity);
+      builder.set_shared_memory_alloc_granularity(
+          params.shared_memory_alloc_granularity);
+    }
+  }
+
+  builder.set_platform_version(
+      port::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));
+
+  // TODO(leary) should be a way to query this from the driver, but this is
+  // unlikely to change for us any time soon.
+  builder.set_device_address_bits(64);
+
+  builder.set_device_vendor("NVIDIA Corporation");
+  builder.set_cuda_compute_capability(cc_major_, cc_minor_);
+  builder.set_shared_memory_per_core(
+      CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+  builder.set_shared_memory_per_block(
+      CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+  builder.set_core_count(
+      CUDADriver::GetMultiprocessorCount(device_).ValueOrDie());
+  builder.set_threads_per_core_limit(
+      CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+  builder.set_registers_per_block_limit(
+      CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+  builder.set_threads_per_warp(
+      CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
+
+  auto built = builder.Build();
+  return built.release();
+}
+
+}  // namespace cuda
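
One conversion in PopulateDeviceDescription() above is easy to misread: `CUdevprop::clockRate` is reported in kilohertz, so dividing by 1e6 yields gigahertz. A self-contained check:

```cpp
#include <cassert>

int main() {
  int clock_rate_khz = 745000;  // e.g. a 745 MHz part, as the driver reports it
  float clock_rate_ghz = static_cast<float>(clock_rate_khz) / 1e6;
  assert(clock_rate_ghz > 0.74f && clock_rate_ghz < 0.75f);
  return 0;
}
```

+namespace gpu = ::perftools::gputools;
+
+void initialize_cuda_gpu_executor() {
+  port::StatusOr<void *> status =
+      gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle();
+  if (!status.ok()) {
+    gpu::cuda::Diagnostician::LogDriverVersionInformation();
+    LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH");
+    LOG(INFO) << "failed to find libcuda.so on this system: "
+              << status.status();
+  }
+
+  // TODO(b/22689637): Temporary until users are migrated off of PlatformKind.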
+  gpu::PluginRegistry::Instance()->MapPlatformKindToId(
+      gpu::PlatformKind::kCuda, gpu::cuda::kCudaPlatformId);
+
+  *gpu::internal::MakeCUDAExecutorImplementation() = [](
+      const gpu::PluginConfig &config) {
+    return new gpu::cuda::CUDAExecutor{config};
+  };
+
+  *gpu::internal::MakeCUDAKernelImplementation() = []() {
+    return new gpu::cuda::CUDAKernel;
+  };
+
+  *gpu::internal::MakeCUDAEventImplementation() = [](
+      gpu::StreamExecutor *parent) {
+    gpu::cuda::CUDAExecutor *cuda_executor =
+        static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation());
+    return new gpu::cuda::CUDAEvent{cuda_executor};
+  };
+
+  *gpu::internal::MakeCUDAStreamImplementation() = [](
+      gpu::StreamExecutor *parent) {
+    gpu::cuda::CUDAExecutor *cuda_executor =
+        static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation());
+    return new gpu::cuda::CUDAStream{cuda_executor};
+  };
+  *gpu::internal::MakeCUDATimerImplementation() = [](
+      gpu::StreamExecutor *parent) {
+    gpu::cuda::CUDAExecutor *cuda_executor =
+        static_cast<gpu::cuda::CUDAExecutor *>(parent->implementation());
+    return new gpu::cuda::CUDATimer{cuda_executor};
+  };
+}
+
+}  // namespace gputools
+}  // namespace perftools
+
+REGISTER_MODULE_INITIALIZER(
+    cuda_gpu_executor, {perftools::gputools::initialize_cuda_gpu_executor();});
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
new file mode 100644
index 0000000000..fda89b9738
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -0,0 +1,270 @@
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
+
+#include <map>
+#include <set>
+
+#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+namespace blas {
+class BlasSupport;
+}
+namespace internal {
+class RngSupport;
+}  // namespace internal
+}  // namespace gputools
+}  // namespace perftools
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+// CUDA-platform implementation of the platform-agnostic
+// StreamExecutorInterface.
+class CUDAExecutor : public internal::StreamExecutorInterface {
+ public:
+  // sub_platform indicates the subplatform used in this executor; it must
+  // be a CUDA type.
+  explicit CUDAExecutor(const PluginConfig &plugin_config)
+      : device_(0),
+        context_(nullptr),
+        device_ordinal_(0),
+        cc_major_(0),
+        cc_minor_(0),
+        plugin_config_(plugin_config) {}
+
+  // See the corresponding StreamExecutor methods for method comments on the
+  // following overrides.
+ + ~CUDAExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override; + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override; + + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &k, + const std::vector<KernelArg> &args) override; + + void *Allocate(uint64 size) override; + + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + + void Deallocate(DeviceMemoryBase *mem) override; + + // CUDA allocation/registration functions are necessary because the driver + // internally sets up buffers for DMA operations (and page locks them). + // There's no external interface for us to otherwise control these DMA + // settings. + void *HostMemoryAllocate(uint64 size) override { + return CUDADriver::HostAllocate(context_, size); + } + + void HostMemoryDeallocate(void *location) override { + return CUDADriver::HostDeallocate(context_, location); + } + + bool HostMemoryRegister(void *location, uint64 size) override; + + bool HostMemoryUnregister(void *location) override; + + bool SynchronizeAllActivity() override; + + bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override; + + bool SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) override; + + bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override; + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override; + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) override; + + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) override; + + bool HostCallback(Stream *stream, std::function<void()> callback) override; + + bool AllocateStream(Stream *stream) override; + + void DeallocateStream(Stream *stream) override; + + bool CreateStreamDependency(Stream *dependent, Stream *other) override; + + bool AllocateTimer(Timer *timer) override; + + void DeallocateTimer(Timer *timer) override; + + bool StartTimer(Stream *stream, Timer *timer) override; + + bool StopTimer(Stream *stream, Timer *timer) override; + + port::Status AllocateEvent(Event *event) override; + + port::Status DeallocateEvent(Event *event) override; + + port::Status RecordEvent(Stream *stream, Event *event) override; + + port::Status WaitForEvent(Stream *stream, Event *event) override; + + Event::Status PollForEventStatus(Event *event) override; + + bool BlockHostUntilDone(Stream *stream) override; + + int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); } + + port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override; + + bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override; + + SharedMemoryConfig GetDeviceSharedMemoryConfig() override; + + port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override; + + bool DeviceMemoryUsage(int64 *free, int64 *total) const override; + + // Search for the symbol and returns a 
device pointer and size.
+  // Returns false if the symbol does not exist.
+  bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override;
+
+  DeviceDescription *PopulateDeviceDescription() const override;
+
+  // Populates the block_dim_limit by querying the device driver API. If an
+  // error occurs at any point while asking the driver for block dim limits, it
+  // will be only partially populated as a result, and an error will be logged.
+  bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
+
+  KernelArg DeviceMemoryToKernelArg(
+      const DeviceMemoryBase &gpu_mem) const override;
+
+  bool SupportsBlas() const override;
+
+  blas::BlasSupport *CreateBlas() override;
+
+  bool SupportsFft() const override;
+
+  fft::FftSupport *CreateFft() override;
+
+  bool SupportsRng() const override;
+
+  rng::RngSupport *CreateRng() override;
+
+  bool SupportsDnn() const override;
+
+  dnn::DnnSupport *CreateDnn() override;
+
+  void *CudaContextHack() override;
+
+  CUcontext cuda_context();
+
+ private:
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for compute-capability-specific suffixed versions;
+  // i.e. looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is
+  // present if we're on a compute capability 3.0 machine.
+  bool FindOnDiskForComputeCapability(port::StringPiece filename,
+                                      port::StringPiece canonical_suffix,
+                                      string *found_filename) const;
+
+  // Host callback landing routine invoked by CUDA.
+  // data: User-provided callback provided to HostCallback() above, captured
+  //     as a std::function<void()>. Allocated/initialized inside
+  //     HostCallback() and owned and deleted by this call.
+  static void InternalHostCallback(CUstream stream, CUresult status,
+                                   void *data);
+
+  // Collects metadata for the specified kernel.
+  bool GetKernelMetadata(CUDAKernel *cuda_kernel,
+                         KernelMetadata *kernel_metadata);
+
+  // Determines if the given kernel's occupancy could be improved by only
+  // slightly reducing its register usage. If so, a message is emitted to the
+  // INFO log. The warning threshold is controlled by the flag
+  // register_occupancy_warning_threshold.
+  void OccupancyCheck(const KernelBase &kernel, const ThreadDim &thread_dims,
+                      const BlockDim &block_dims);
+
+  // Guards the on-disk-module mapping.
+  mutex disk_modules_mu_;
+
+  // Mapping from filename to CUmodule, if it was already retrieved.
+  // Multiple CUfunctions are usually obtained from a single CUmodule so we
+  // attempt to hit in this mapping first, before retrieving it.
+  std::map<string, CUmodule> disk_modules_ GUARDED_BY(disk_modules_mu_);
+
+  // Guards the in-memory-module mapping.
+  mutex in_memory_modules_mu_;
+
+  std::map<const char *, CUmodule> in_memory_modules_
+      GUARDED_BY(in_memory_modules_mu_);
+
+  // Guards the launched kernel set.
+  mutex launched_kernels_mu_;
+
+  // Keeps track of the set of launched kernels. Currently used to suppress the
+  // occupancy check on subsequent launches.
+  std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
+
+  // Handle for the CUDA device being operated on. Immutable
+  // post-initialization.
+  CUdevice device_;
+
+  // Handle for session with the library/driver. Immutable post-initialization.
+  CUcontext context_;
+
+  // The device ordinal value that this executor was initialized with; recorded
+  // for use in getting device metadata. Immutable post-initialization.
+  int device_ordinal_;
+
+  // The major version of the compute capability for device_.
+  int cc_major_;
+
+  // The minor version of the compute capability for device_.
+  int cc_minor_;
+
+  // The plugin configuration associated with this instance.
+  PluginConfig plugin_config_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
new file mode 100644
index 0000000000..2c5311cb3b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -0,0 +1,95 @@
+// Common helper functions used for dealing with CUDA API datatypes.
+//
+// These are typically placed here for use by multiple source components (for
+// example, BLAS and executor components).
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
+
+#include <stddef.h>
+#include <complex>
+
+#include "third_party/gpus/cuda/include/cuComplex.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+template <typename ElemT>
+class DeviceMemory;
+
+namespace cuda {
+
+// Converts a const DeviceMemory reference to its underlying typed pointer in
+// CUDA device memory.
+template <typename T>
+const T *CUDAMemory(const DeviceMemory<T> &mem) {
+  return static_cast<const T *>(mem.opaque());
+}
+
+// Converts a (non-const) DeviceMemory pointer reference to its underlying
+// typed pointer in CUDA device memory.
+template <typename T>
+T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
+  return static_cast<T *>(mem->opaque());
+}
+
+CUstream AsCUDAStreamValue(Stream *stream);
+
+static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
+              "std::complex<float> and cuComplex should have the same size");
+static_assert(offsetof(cuComplex, x) == 0,
+              "The real part of cuComplex should appear first.");
+static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
+              "std::complex<double> and cuDoubleComplex should have the same "
+              "size");
+static_assert(offsetof(cuDoubleComplex, x) == 0,
+              "The real part of cuDoubleComplex should appear first.");
+
+// Type traits to get CUDA complex types from std::complex<>.
+
+template <typename T>
+struct CUDAComplexT {
+  typedef T type;
+};
+
+template <>
+struct CUDAComplexT<std::complex<float>> {
+  typedef cuComplex type;
+};
+
+template <>
+struct CUDAComplexT<std::complex<double>> {
+  typedef cuDoubleComplex type;
+};
+
+// Converts pointers of std::complex<> to pointers of
+// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
+
+template <typename T>
+inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
+  return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
+}
+
+template <typename T>
+inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
+  return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
+}
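
The pointer casts above are only sound because of the layout `static_assert`s earlier in this header. A self-contained sketch of the bridging, assuming the CUDA toolkit headers are on the include path:

```cpp
#include <cassert>
#include <complex>

#include "third_party/gpus/cuda/include/cuComplex.h"

int main() {
  std::complex<float> host[2] = {{1.0f, 2.0f}, {3.0f, 4.0f}};
  // Safe only because sizeof/offsetof match, per the static_asserts above.
  const cuComplex* bridged = reinterpret_cast<const cuComplex*>(host);
  assert(cuCrealf(bridged[1]) == 3.0f);
  assert(cuCimagf(bridged[1]) == 4.0f);
  return 0;
}
```

+// Converts values of std::complex<float/double> to values of
+// cuComplex/cuDoubleComplex.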
+inline cuComplex CUDAComplexValue(std::complex<float> val) { + return {val.real(), val.imag()}; +} + +inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) { + return {val.real(), val.imag()}; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_ diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h new file mode 100644 index 0000000000..e8ad3955e9 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_kernel.h @@ -0,0 +1,115 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ + +#include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "third_party/gpus/cuda/include/cuda.h" + +#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_ +#error \ + "No driver calls in this file, wrap driver functionality in cuda_driver.cc." +#endif + +#ifdef __CUDA_RUNTIME_H__ +#error \ + "CUDA runtime being included into CUDA GPU executor; should be driver only." +#endif + +namespace perftools { +namespace gputools { +namespace cuda { + +// Wraps a CUfunction to implement the platform-independent KernelInterface. +class CUDAKernel : public internal::KernelInterface { + public: + CUDAKernel() : cuda_function_(nullptr), arity_(0), + preferred_cache_config_(KernelCacheConfig::kNoPreference) {} + + // Note that the function is unloaded when the module is unloaded, and the + // module that the function is contained in is owned by the CUDAExecutor. + ~CUDAKernel() override {} + + // As arity cannot be reflected upon using the CUDA API, the arity is + // explicitly set during the CUDAExecutor::GetKernel initialization process. + void set_arity(unsigned arity) { arity_ = arity; } + unsigned Arity() const override { return arity_; } + + // Returns the CUfunction value for passing to the CUDA API. + CUfunction AsCUDAFunctionValue() const { + DCHECK(cuda_function_ != nullptr); + return const_cast<CUfunction>(cuda_function_); + } + + // Returns the slot that the CUfunction is stored within for this object, + // for the CUDA API which wants to load into a CUfunction*. + CUfunction *cuda_function_ptr() { return &cuda_function_; } + + // CUDA supports setting the preferred cache configuration of a CUfunction + // (more-or-less equivalent to a CUDAKernel). We support this via the below + // functions; users can set a preference, and that is applied when the kernel + // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to + // load the kernel & set the preference when the user calls the setter below; + // either approach is valid. + // Sets the current kernel cache configuration preference. 
+  void SetPreferredCacheConfig(KernelCacheConfig config) override {
+    preferred_cache_config_ = config;
+  }
+
+  // Returns the current kernel cache configuration preference.
+  KernelCacheConfig GetPreferredCacheConfig() const override {
+    return preferred_cache_config_;
+  }
+
+  // Returns the current kernel cache configuration preference as a
+  // CUfunc_cache.
+  CUfunc_cache GetCUDACacheConfig() const {
+    switch (preferred_cache_config_) {
+      case KernelCacheConfig::kNoPreference:
+        return CU_FUNC_CACHE_PREFER_NONE;
+      case KernelCacheConfig::kPreferShared:
+        return CU_FUNC_CACHE_PREFER_SHARED;
+      case KernelCacheConfig::kPreferL1:
+        return CU_FUNC_CACHE_PREFER_L1;
+      case KernelCacheConfig::kPreferEqual:
+        return CU_FUNC_CACHE_PREFER_EQUAL;
+      default:
+        LOG(FATAL) << "Unknown KernelCacheConfig: "
+                   << static_cast<int32>(preferred_cache_config_);
+    }
+  }
+
+ private:
+  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
+  unsigned arity_;  // Number of formal parameters the kernel takes.
+
+  // Preferred (but not required) cache configuration for this kernel.
+  KernelCacheConfig preferred_cache_config_;
+};
+
+// Given a platform-independent kernel datatype, returns the (const) internal
+// CUDA platform implementation pointer.
+inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
+  return static_cast<const CUDAKernel *>(kernel->implementation());
+}
+
+// Given a platform-independent kernel datatype, returns the (non-const)
+// internal CUDA platform implementation pointer.
+inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
+  return static_cast<CUDAKernel *>(kernel->implementation());
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
new file mode 100644
index 0000000000..ef88b89eda
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -0,0 +1,172 @@
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLATFORM_DEFINE_ID(kCudaPlatformId);
+
+CudaPlatform::CudaPlatform()
+    : name_("CUDA"), min_numa_node_(0), limit_numa_node_(0) {}
+
+CudaPlatform::~CudaPlatform() {}
+
+// Due to legacy issues in user code, we can't currently call InspectNumaNodes
+// at module initialization time, because non-GPU programs still include this
+// plugin via various methods, so instead, it has to be init-on-reference.
+void CudaPlatform::InspectNumaNodes() {
+  // To get NUMA node information, we need to create all executors, so we can
+  // examine their device descriptions to see their bus assignments.
+  static bool initialized = false;
+  static mutex numa_mutex(LINKER_INITIALIZED);
+  mutex_lock lock(numa_mutex);
+  if (initialized) {
+    return;
+  }
+
+  StreamExecutorConfig config;
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    config.ordinal = i;
+    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+    if (i == 0) {
+      // NUMA nodes may not start at 0, so set the minimum node based on the
+      // first executor we see.
+ min_numa_node_ = exec->GetDeviceDescription().numa_node(); + limit_numa_node_ = min_numa_node_ + 1; + } else { + min_numa_node_ = + std::min(min_numa_node_, exec->GetDeviceDescription().numa_node()); + limit_numa_node_ = std::max(limit_numa_node_, + exec->GetDeviceDescription().numa_node() + 1); + } + } + initialized = true; +} + +int CudaPlatform::BusCount() { + InspectNumaNodes(); + return limit_numa_node_ - min_numa_node_; +} + +int CudaPlatform::DeviceToBus(int device_ordinal) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + StreamExecutor* exec = GetExecutor(config).ValueOrDie(); + return exec->GetDeviceDescription().numa_node() - min_numa_node_; +} + +port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus( + int bus_ordinal) { + InspectNumaNodes(); + CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range"; + for (int i = 0; i < VisibleDeviceCount(); i++) { + if (DeviceToBus(i) == bus_ordinal) { + StreamExecutorConfig config; + config.ordinal = i; + return GetExecutor(config).ValueOrDie(); + } + } + + return port::Status{ + port::error::NOT_FOUND, + port::Printf("Executor for bus %d not found.", bus_ordinal)}; +} + +Platform::Id CudaPlatform::id() const { return kCudaPlatformId; } + +int CudaPlatform::VisibleDeviceCount() const { + // Throw away the result - it logs internally, and this [containing] function + // isn't in the path of user control. It's safe to call this > 1x. + if (!cuda::CUDADriver::Init().ok()) { + return -1; + } + + return CUDADriver::GetDeviceCount(); +} + +const string& CudaPlatform::Name() const { return name_; } + +port::StatusOr<StreamExecutor*> CudaPlatform::ExecutorForDevice(int ordinal) { + StreamExecutorConfig config; + config.ordinal = ordinal; + config.plugin_config = PluginConfig(); + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> CudaPlatform::ExecutorForDeviceWithPluginConfig( + int device_ordinal, const PluginConfig& plugin_config) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + config.plugin_config = plugin_config; + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor( + const StreamExecutorConfig& config) { + mutex_lock lock(mu_); + + port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config); + if (status.ok()) { + return status.ValueOrDie(); + } + + port::StatusOr<std::unique_ptr<StreamExecutor>> executor = + GetUncachedExecutor(config); + if (!executor.ok()) { + return executor.status(); + } + + StreamExecutor* naked_executor = executor.ValueOrDie().get(); + executor_cache_.Insert(config, executor.ConsumeValueOrDie()); + return naked_executor; +} + +port::StatusOr<std::unique_ptr<StreamExecutor>> +CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { + auto executor = port::MakeUnique<StreamExecutor>(PlatformKind::kCuda, + config.plugin_config); + auto init_status = executor->Init(config.ordinal, config.device_options); + if (!init_status.ok()) { + return port::Status{ + port::error::INTERNAL, + port::Printf( + "failed initializing StreamExecutor for CUDA device ordinal %d: %s", + config.ordinal, init_status.ToString().c_str())}; + } + + return std::move(executor); +} + +void CudaPlatform::RegisterTraceListener( + std::unique_ptr<TraceListener> listener) { + LOG(FATAL) << "not yet implemented: register CUDA trace listener"; +} + +void 
CudaPlatform::UnregisterTraceListener(TraceListener* listener) { + LOG(FATAL) << "not yet implemented: unregister CUDA trace listener"; +} + +} // namespace cuda + +static void InitializeCudaPlatform() { + // Disabling leak checking, MultiPlatformManager does not destroy its + // registered platforms. + + std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform); + SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform))); +} + +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(cuda_platform, + perftools::gputools::InitializeCudaPlatform()); diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h new file mode 100644 index 0000000000..966d7343f7 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_platform.h @@ -0,0 +1,98 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_ + +#include <memory> +#include "tensorflow/stream_executor/platform/port.h" +#include <vector> + +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +// Opaque and unique identifier for the CUDA platform plugin. +// This is needed so that plugins can refer to/identify this platform without +// instantiating a CudaPlatform object. +extern const Platform::Id kCudaPlatformId; + +// Cuda-specific platform plugin, registered as a singleton value via module +// initializer. +class CudaPlatform : public Platform { + public: + CudaPlatform(); + ~CudaPlatform() override; + + // CudaPlatform-specific functionality + // Returns the number of distinct buses / NUMA nodes on the machine. + int BusCount(); + + // Returns the bus/NUMA node for the specified device ordinal. + int DeviceToBus(int device_ordinal); + + // Returns the lowest-ordinal-number StreamExecutor on the specified bus. + port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal); + + // Platform interface implementation: + // Returns the same value as kCudaPlatform above. + Platform::Id id() const override; + + // Returns -1 as a sentinel on internal failure (and logs the error). + int VisibleDeviceCount() const override; + + const string& Name() const override; + + port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override; + + port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& config) override; + + port::StatusOr<StreamExecutor*> GetExecutor( + const StreamExecutorConfig& config) override; + + port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor( + const StreamExecutorConfig& config) override; + + void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override; + + void UnregisterTraceListener(TraceListener* listener) override; + + private: + // Determines the number of NUMA nodes and the assignment of executor to each. + void InspectNumaNodes(); + + // This platform's name. 
+  string name_;
+
+  // mutex that guards internal state.
+  mutable mutex mu_;
+
+  // Cache of created executors.
+  ExecutorCache executor_cache_;
+
+  // The smallest NUMA node value for any device managed by this machine
+  // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
+  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense.
+  int min_numa_node_;
+
+  // Larger than the NUMA node value for any device managed by this machine
+  // manager.
+  int limit_numa_node_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
new file mode 100644
index 0000000000..ad48c8b59a
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -0,0 +1,317 @@
+#include "tensorflow/stream_executor/cuda/cuda_rng.h"
+
+#include <dlfcn.h>
+
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
+#include "tensorflow/stream_executor/cuda/cuda_platform.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/rng.h"
+#include "third_party/gpus/cuda/include/curand.h"
+
+// Formats curandStatus_t to output prettified values into a log stream.
+std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
+#define OSTREAM_CURAND_STATUS(__name) \
+  case CURAND_STATUS_##__name:        \
+    in << "CURAND_STATUS_" #__name;   \
+    return in;
+
+  switch (status) {
+    OSTREAM_CURAND_STATUS(SUCCESS)
+    OSTREAM_CURAND_STATUS(VERSION_MISMATCH)
+    OSTREAM_CURAND_STATUS(NOT_INITIALIZED)
+    OSTREAM_CURAND_STATUS(ALLOCATION_FAILED)
+    OSTREAM_CURAND_STATUS(TYPE_ERROR)
+    OSTREAM_CURAND_STATUS(OUT_OF_RANGE)
+    OSTREAM_CURAND_STATUS(LENGTH_NOT_MULTIPLE)
+    OSTREAM_CURAND_STATUS(LAUNCH_FAILURE)
+    OSTREAM_CURAND_STATUS(PREEXISTING_FAILURE)
+    OSTREAM_CURAND_STATUS(INITIALIZATION_FAILED)
+    OSTREAM_CURAND_STATUS(ARCH_MISMATCH)
+    OSTREAM_CURAND_STATUS(INTERNAL_ERROR)
+    default:
+      in << "curandStatus_t(" << static_cast<int>(status) << ")";
+      return in;
+  }
+}
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
+
+namespace dynload {
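
The wrap macro defined just below follows a common dlopen/dlsym shim pattern: resolve the symbol once, cache it in a function-local static, and call through a typed pointer. A self-contained sketch of the same pattern, using libm and `cos` as stand-ins for the cuRAND DSO and its entry points (the `libm.so.6` soname assumes glibc):

```cpp
#include <dlfcn.h>

#include <cstdio>

using CosFn = double (*)(double);

static CosFn LoadCos() {
  // Both statics are initialized once; later calls reuse the cached results.
  static void* handle = dlopen("libm.so.6", RTLD_LAZY);
  static void* sym = handle != nullptr ? dlsym(handle, "cos") : nullptr;
  return reinterpret_cast<CosFn>(sym);
}

int main() {
  if (CosFn f = LoadCos()) {
    std::printf("%f\n", f(0.0));  // 1.000000
  }
  return 0;
}
```

+#define PERFTOOLS_GPUTOOLS_CURAND_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                            \
+    static const char *kName;                                               \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;        \
+    static void *GetDsoHandle() {                                           \
+      static auto status = internal::CachedDsoLoader::GetCurandDsoHandle(); \
+      return status.ValueOrDie();                                           \
+    }                                                                       \
+    static FuncPointerT DynLoad() {                                         \
+      static void *f = dlsym(GetDsoHandle(), kName);                        \
+      CHECK(f != nullptr) << "could not find " << kName                     \
+                          << " in curand DSO; dlerror: " << dlerror();      \
+      return reinterpret_cast<FuncPointerT>(f);                             \
+    }                                                                       \
+    template <typename... Args>                                             \
+    curandStatus_t operator()(CUDAExecutor * parent, Args...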
args) { \ + cuda::ScopedActivateExecutorContext sac{parent}; \ + return DynLoad()(args...); \ + } \ + } __name; \ + const char *DynLoadShim__##__name::kName = #__name; + +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandCreateGenerator); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandDestroyGenerator); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetStream); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniform); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniformDouble); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetGeneratorOffset); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormal); +PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormalDouble); + +} // namespace dynload + +template <typename T> +string TypeString(); + +template <> +string TypeString<float>() { + return "float"; +} + +template <> +string TypeString<double>() { + return "double"; +} + +template <> +string TypeString<std::complex<float>>() { + return "std::complex<float>"; +} + +template <> +string TypeString<std::complex<double>>() { + return "std::complex<double>"; +} + +CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {} + +CUDARng::~CUDARng() { + if (rng_ != nullptr) { + dynload::curandDestroyGenerator(parent_, rng_); + } +} + +bool CUDARng::Init() { + mutex_lock lock{mu_}; + CHECK(rng_ == nullptr); + + curandStatus_t ret = + dynload::curandCreateGenerator(parent_, &rng_, CURAND_RNG_PSEUDO_DEFAULT); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create random number generator: " << ret; + return false; + } + + CHECK(rng_ != nullptr); + return true; +} + +bool CUDARng::SetStream(Stream *stream) { + curandStatus_t ret = + dynload::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream)); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set stream for random generation: " << ret; + return false; + } + + return true; +} + +// Returns true if std::complex stores its contents as two consecutive +// elements. Tests int, float and double, as the last two are independent +// specializations. +constexpr bool ComplexIsConsecutiveFloats() { + return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 && + sizeof(std::complex<double>) == 16; +} + +template <typename T> +bool CUDARng::DoPopulateRandUniformInternal(Stream *stream, + DeviceMemory<T> *v) { + mutex_lock lock{mu_}; + static_assert(ComplexIsConsecutiveFloats(), + "std::complex values are not stored as consecutive values"); + + if (!SetStream(stream)) { + return false; + } + + // std::complex<T> is currently implemented as two consecutive T variables. 
+ uint64 element_count = v->ElementCount(); + if (std::is_same<T, std::complex<float>>::value || + std::is_same<T, std::complex<double>>::value) { + element_count *= 2; + } + + curandStatus_t ret; + if (std::is_same<T, float>::value || + std::is_same<T, std::complex<float>>::value) { + ret = dynload::curandGenerateUniform( + parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)), + element_count); + } else { + ret = dynload::curandGenerateUniformDouble( + parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)), + element_count); + } + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount() + << " " << TypeString<T>() << "s at " << v->opaque() << ": " + << ret; + return false; + } + + return true; +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<float>> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +bool CUDARng::DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<double>> *v) { + return DoPopulateRandUniformInternal(stream, v); +} + +template <typename ElemT, typename FuncT> +bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, + ElemT stddev, + DeviceMemory<ElemT> *v, + FuncT func) { + mutex_lock lock{mu_}; + + if (!SetStream(stream)) { + return false; + } + + uint64 element_count = v->ElementCount(); + curandStatus_t ret = + func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev); + + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount() + << " floats at " << v->opaque() << ": " << ret; + return false; + } + + return true; +} + +bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev, + DeviceMemory<float> *v) { + return DoPopulateRandGaussianInternal(stream, mean, stddev, v, + dynload::curandGenerateNormal); +} + +bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev, + DeviceMemory<double> *v) { + return DoPopulateRandGaussianInternal(stream, mean, stddev, v, + dynload::curandGenerateNormalDouble); +} + +bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) { + mutex_lock lock{mu_}; + CHECK(rng_ != nullptr); + + if (!CheckSeed(seed, seed_bytes)) { + return false; + } + + if (!SetStream(stream)) { + return false; + } + + // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above) + // (which itself requires 16 for API consistency with host RNG fallbacks). 
+ curandStatus_t ret = dynload::curandSetPseudoRandomGeneratorSeed( + parent_, rng_, *(reinterpret_cast<const uint64 *>(seed))); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set rng seed: " << ret; + return false; + } + + ret = dynload::curandSetGeneratorOffset(parent_, rng_, 0); + if (ret != CURAND_STATUS_SUCCESS) { + LOG(ERROR) << "failed to reset rng position: " << ret; + return false; + } + return true; +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +namespace gpu = ::perftools::gputools; + +REGISTER_MODULE_INITIALIZER(register_curand, { + gpu::port::Status status = + gpu::PluginRegistry::Instance() + ->RegisterFactory<gpu::PluginRegistry::RngFactory>( + gpu::cuda::kCudaPlatformId, gpu::cuda::kCuRandPlugin, "cuRAND", + [](gpu::internal::StreamExecutorInterface + *parent) -> gpu::rng::RngSupport * { + gpu::cuda::CUDAExecutor *cuda_executor = + dynamic_cast<gpu::cuda::CUDAExecutor *>(parent); + if (cuda_executor == nullptr) { + LOG(ERROR) + << "Attempting to initialize an instance of the cuRAND " + << "support library with a non-CUDA StreamExecutor"; + return nullptr; + } + + gpu::cuda::CUDARng *rng = new gpu::cuda::CUDARng(cuda_executor); + if (!rng->Init()) { + // Note: Init() will log a more specific error. + delete rng; + return nullptr; + } + return rng; + }); + + if (!status.ok()) { + LOG(ERROR) << "Unable to register cuRAND factory: " + << status.error_message(); + } + + // Prime the cuRAND DSO. The loader will log more information. + auto statusor = gpu::internal::CachedDsoLoader::GetCurandDsoHandle(); + if (!statusor.ok()) { + LOG(INFO) << "Unable to load cuRAND DSO."; + } + + gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId, + gpu::PluginKind::kRng, + gpu::cuda::kCuRandPlugin); +}); diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h new file mode 100644 index 0000000000..4e1b82969b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_rng.h @@ -0,0 +1,89 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_ + +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/plugin_registry.h" +#include "tensorflow/stream_executor/rng.h" + +typedef struct curandGenerator_st *curandGenerator_t; + +namespace perftools { +namespace gputools { + +class Stream; +template <typename ElemT> +class DeviceMemory; + +namespace cuda { + +// Opaque and unique identifier for the cuRAND plugin. +extern const PluginId kCuRandPlugin; + +class CUDAExecutor; + +// CUDA-platform implementation of the random number generation support +// interface. +// +// Thread-safe post-initialization. +class CUDARng : public rng::RngSupport { + public: + explicit CUDARng(CUDAExecutor *parent); + + // Retrieves a curand library generator handle. This is necessary for + // enqueuing random number generation work onto the device. + // TODO(leary) provide a way for users to select the RNG algorithm. + bool Init(); + + // Releases a curand library generator handle, if one was acquired. + ~CUDARng() override; + + // See rng::RngSupport for details on the following overrides. 
+  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override;
+  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override;
+  bool DoPopulateRandUniform(Stream *stream,
+                             DeviceMemory<std::complex<float>> *v) override;
+  bool DoPopulateRandUniform(Stream *stream,
+                             DeviceMemory<std::complex<double>> *v) override;
+  bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
+                              DeviceMemory<float> *v) override;
+  bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
+                              DeviceMemory<double> *v) override;
+
+  bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override;
+
+ private:
+  // Actually performs the work of generating random numbers - the public
+  // methods are thin wrappers to this interface.
+  template <typename T>
+  bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v);
+  template <typename ElemT, typename FuncT>
+  bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev,
+                                      DeviceMemory<ElemT> *v, FuncT func);
+
+  // Sets the stream for the internal curand generator.
+  //
+  // This is a stateful operation, as the handle can only have one stream set
+  // at a given time, so it is usually performed right before enqueuing random
+  // number generation work.
+  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // mutex that guards the cuRAND handle for this device.
+  mutex mu_;
+
+  // CUDAExecutor which instantiated this CUDARng.
+  // Immutable post-initialization.
+  CUDAExecutor *parent_;
+
+  // cuRAND library handle on the device.
+  curandGenerator_t rng_ GUARDED_BY(mu_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CUDARng);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/cuda/cuda_stream.cc
new file mode 100644
index 0000000000..e70579b55c
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_stream.cc
@@ -0,0 +1,51 @@
+#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+
+#include "tensorflow/stream_executor/lib/status.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+bool CUDAStream::Init() {
+  return CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_);
+}
+
+void CUDAStream::Destroy() {
+  {
+    mutex_lock lock{mu_};
+    if (completed_event_ != nullptr) {
+      port::Status status =
+          CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_);
+      if (!status.ok()) {
+        LOG(ERROR) << status.error_message();
+      }
+    }
+  }
+
+  CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_);
+}
+
+bool CUDAStream::IsIdle() const {
+  return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_);
+}
+
+bool CUDAStream::GetOrCreateCompletedEvent(CUevent *completed_event) {
+  mutex_lock lock{mu_};
+  if (completed_event_ != nullptr) {
+    *completed_event = completed_event_;
+    return true;
+  }
+
+  if (!CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_,
+                               CUDADriver::EventFlags::kDisableTiming)
+           .ok()) {
+    return false;
+  }
+
+  *completed_event = completed_event_;
+  return true;
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h
new file mode 100644
index 0000000000..f6db64a1bf
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_stream.h
@@ -0,0 +1,74 @@
+// Defines the CUDAStream type -
the CUDA-specific implementation of the generic +// StreamExecutor Stream interface. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_ + +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { +namespace cuda { + +class CUDAExecutor; + +// Wraps a CUstream in order to satisfy the platform-independent +// StreamInterface. +// +// Thread-safe post-initialization. +class CUDAStream : public internal::StreamInterface { + public: + explicit CUDAStream(CUDAExecutor *parent) + : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {} + + // Note: teardown is handled by a parent's call to DeallocateStream. + ~CUDAStream() override {} + + void *CudaStreamHack() override { return cuda_stream_; } + void **CudaStreamMemberHack() override { + return reinterpret_cast<void **>(&cuda_stream_); + } + + // Explicitly initialize the CUDA resources associated with this stream, used + // by StreamExecutor::AllocateStream(). + bool Init(); + + // Explicitly destroy the CUDA resources associated with this stream, used by + // StreamExecutor::DeallocateStream(). + void Destroy(); + + // Returns true if no work is pending or executing on the stream. + bool IsIdle() const; + + // Retrieves an event which indicates that all work enqueued into the stream + // has completed. Ownership of the event is not transferred to the caller, the + // event is owned by this stream. + bool GetOrCreateCompletedEvent(CUevent *completed_event); + + // Returns the CUstream value for passing to the CUDA API. + // + // Precond: this CUDAStream has been allocated (otherwise passing a nullptr + // into the NVIDIA library causes difficult-to-understand faults). + CUstream cuda_stream() const { + DCHECK(cuda_stream_ != nullptr); + return const_cast<CUstream>(cuda_stream_); + } + + CUDAExecutor *parent() const { return parent_; } + + private: + mutex mu_; // mutex that guards the completion event. + CUDAExecutor *parent_; // Executor that spawned this stream. + CUstream cuda_stream_; // Wrapped CUDA stream handle. + + // Event that indicates this stream has completed. 
+  CUevent completed_event_ GUARDED_BY(mu_);
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc
new file mode 100644
index 0000000000..ad5e13ab6b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_timer.cc
@@ -0,0 +1,73 @@
+#include "tensorflow/stream_executor/cuda/cuda_timer.h"
+
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/lib/status.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+bool CUDATimer::Init() {
+  CHECK(start_event_ == nullptr && stop_event_ == nullptr);
+  CUcontext context = parent_->cuda_context();
+  if (!CUDADriver::CreateEvent(context, &start_event_,
+                               CUDADriver::EventFlags::kDefault)
+           .ok()) {
+    return false;
+  }
+
+  if (!CUDADriver::CreateEvent(context, &stop_event_,
+                               CUDADriver::EventFlags::kDefault)
+           .ok()) {
+    port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+    if (!status.ok()) {
+      LOG(ERROR) << status;
+    }
+    return false;
+  }
+
+  CHECK(start_event_ != nullptr && stop_event_ != nullptr);
+  return true;
+}
+
+void CUDATimer::Destroy() {
+  CUcontext context = parent_->cuda_context();
+  port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+
+  status = CUDADriver::DestroyEvent(context, &stop_event_);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+}
+
+float CUDATimer::GetElapsedMilliseconds() const {
+  CHECK(start_event_ != nullptr && stop_event_ != nullptr);
+  // TODO(leary) provide a way to query timer resolution?
+  // CUDA docs say a resolution of about 0.5us
+  float elapsed_milliseconds = NAN;
+  (void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(),
+                                        &elapsed_milliseconds, start_event_,
+                                        stop_event_);
+  return elapsed_milliseconds;
+}
+
+bool CUDATimer::Start(CUDAStream *stream) {
+  return CUDADriver::RecordEvent(parent_->cuda_context(), start_event_,
+                                 stream->cuda_stream())
+      .ok();
+}
+
+bool CUDATimer::Stop(CUDAStream *stream) {
+  return CUDADriver::RecordEvent(parent_->cuda_context(), stop_event_,
+                                 stream->cuda_stream())
+      .ok();
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
new file mode 100644
index 0000000000..e49e212403
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -0,0 +1,69 @@
+// Defines the CUDATimer type - the CUDA-specific implementation of the generic
+// StreamExecutor Timer interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
+
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+class CUDAExecutor;
+class CUDAStream;
+
+// Wraps a pair of CUevents in order to satisfy the platform-independent
+// TimerInterface -- both a start and a stop event are present which may be
+// recorded in a stream.
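+//
+// A minimal usage sketch, assuming an already-initialized CUDAExecutor and
+// CUDAStream (real callers normally go through StreamExecutor::AllocateTimer()
+// and the Stream timer builder methods rather than using this type directly):
+//
+//   CUDATimer timer{cuda_executor};
+//   if (timer.Init() && timer.Start(cuda_stream)) {
+//     // ... enqueue work on the stream ...
+//     timer.Stop(cuda_stream);
+//     // The stream must complete before the reading is meaningful.
+//     LOG(INFO) << timer.GetElapsedMilliseconds() << " ms elapsed";
+//   }
+//   timer.Destroy();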
+class CUDATimer : public internal::TimerInterface {
+ public:
+  explicit CUDATimer(CUDAExecutor *parent)
+      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
+
+  // Note: teardown is explicitly handled in this API by a call to
+  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
+  ~CUDATimer() override {}
+
+  // Allocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::AllocateTimer().
+  bool Init();
+
+  // Deallocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::DeallocateTimer().
+  void Destroy();
+
+  // Records the "timer start" event at the current point in the stream.
+  bool Start(CUDAStream *stream);
+
+  // Records the "timer stop" event at the current point in the stream.
+  bool Stop(CUDAStream *stream);
+
+  // Returns the elapsed time, in milliseconds, between the start and stop
+  // events.
+  float GetElapsedMilliseconds() const;
+
+  // See perftools::gputools::Timer::Microseconds().
+  // TODO(leary) make this into an error code interface...
+  uint64 Microseconds() const override {
+    return GetElapsedMilliseconds() * 1e3;
+  }
+
+  // See perftools::gputools::Timer::Nanoseconds().
+  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
+
+ private:
+  CUDAExecutor *parent_;
+  CUevent start_event_;  // Event recorded to indicate the "start" timestamp
+                         // executing in a stream.
+  CUevent stop_event_;   // Event recorded to indicate the "stop" timestamp
+                         // executing in a stream.
+};
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
diff --git a/tensorflow/stream_executor/cuda/multi_op_activation.h b/tensorflow/stream_executor/cuda/multi_op_activation.h
new file mode 100644
index 0000000000..ba2bcd3a91
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/multi_op_activation.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+// Type-safe boolean wrapper: denotes whether a ScopedActivateExecutorContext
+// may have other ScopedActivateExecutorContexts nested within it.
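+//
+// For example (illustrative sketch only; ScopedActivateExecutorContext is
+// declared elsewhere in this library, and `cuda_executor` is a hypothetical
+// CUDAExecutor):
+//
+//   ScopedActivateExecutorContext activation{cuda_executor,
+//                                            MultiOpActivation::kYes};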
+enum class MultiOpActivation { kNo = false, kYes = true };
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_MULTI_OP_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
new file mode 100644
index 0000000000..23c110c2f3
--- /dev/null
+++ b/tensorflow/stream_executor/device_description.cc
@@ -0,0 +1,221 @@
+#include "tensorflow/stream_executor/device_description.h"
+
+#include <algorithm>
+
+#include "tensorflow/stream_executor/lib/human_readable.h"
+#include "tensorflow/stream_executor/lib/mathutil.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+
+namespace perftools {
+namespace gputools {
+
+static const uint64 kUninitializedUint64 = -1ULL;
+/* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
+
+DeviceDescription::DeviceDescription()
+    : device_vendor_(kUndefinedString),
+      platform_version_(kUndefinedString),
+      driver_version_(kUndefinedString),
+      runtime_version_(kUndefinedString),
+      pci_bus_id_(kUndefinedString),
+      name_(kUndefinedString),
+      thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
+                        kUninitializedUint64),
+      block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
+                       kUninitializedUint64),
+      blocks_per_core_limit_(kUninitializedUint64),
+      threads_per_core_limit_(kUninitializedUint64),
+      threads_per_block_limit_(kUninitializedUint64),
+      threads_per_warp_(kUninitializedUint64),
+      registers_per_core_limit_(kUninitializedUint64),
+      registers_per_block_limit_(kUninitializedUint64),
+      registers_per_thread_limit_(kUninitializedUint64),
+      warp_alloc_granularity_(1),
+      register_alloc_granularity_(1),
+      shared_memory_alloc_granularity_(1),
+      device_address_bits_(kUninitializedUint64),
+      device_memory_size_(kUninitializedUint64),
+      shared_memory_per_core_(kUninitializedUint64),
+      shared_memory_per_block_(kUninitializedUint64),
+      clock_rate_ghz_(-1.0),
+      cuda_compute_capability_major_(-1),
+      cuda_compute_capability_minor_(-1),
+      numa_node_(-1),
+      core_count_(-1),
+      ecc_enabled_(false) {}
+
+std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
+  std::unique_ptr<std::map<string, string>> owned_result{
+      new std::map<string, string>};
+  std::map<string, string> &result = *owned_result;
+  result["Device Vendor"] = device_vendor();
+  result["Platform Version"] = platform_version();
+  result["Driver Version"] = driver_version();
+  result["Runtime Version"] = runtime_version();
+  result["PCI bus ID"] = pci_bus_id_;
+  result["Device Name"] = name_;
+
+  const ThreadDim &thread_dim = thread_dim_limit();
+  result["ThreadDim Limit"] =
+      port::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
+  const BlockDim &block_dim = block_dim_limit();
+  result["BlockDim Limit"] =
+      port::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
+
+  result["Threads Per Core Limit"] = port::StrCat(threads_per_core_limit());
+  result["Threads Per Block Limit"] = port::StrCat(threads_per_block_limit());
+  result["Registers Per Block Limit"] =
+      port::StrCat(registers_per_block_limit());
+
+  result["Device Address Bits"] = port::StrCat(device_address_bits());
+  result["Device Memory Size"] =
+      port::HumanReadableNumBytes::ToString(device_memory_size());
+
+  result["Shared Memory Per Core"] =
+      port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
+  result["Shared Memory Per Block"] =
+      port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
+
+  result["Clock Rate GHz"] = port::StrCat(clock_rate_ghz());
+
+  result["CUDA Compute Capability"] = port::StrCat(
+      cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);
+
+  result["NUMA Node"] = port::StrCat(numa_node());
+  result["Core Count"] = port::StrCat(core_count());
+  result["ECC Enabled"] = port::StrCat(ecc_enabled());
+  return owned_result;
+}
+
+namespace internal {
+
+DeviceDescriptionBuilder::DeviceDescriptionBuilder()
+    : device_description_(new DeviceDescription) {}
+
+}  // namespace internal
+
+bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
+  *major = cuda_compute_capability_major_;
+  *minor = cuda_compute_capability_minor_;
+  // The major version is initialized to -1, so require a strictly positive
+  // value before reporting a valid CUDA compute capability.
+  return cuda_compute_capability_major_ > 0;
+}
+
+bool ThreadDimOk(const DeviceDescription &device_description,
+                 const ThreadDim &thread_dim) {
+  auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
+  auto threads_per_block_limit = device_description.threads_per_block_limit();
+  if (total_threads > threads_per_block_limit) {
+    VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
+            << " vs limit " << threads_per_block_limit;
+    return false;
+  }
+
+  const auto &limit = device_description.thread_dim_limit();
+  bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
+            thread_dim.z <= limit.z;
+  if (!ok) {
+    VLOG(2) << "thread dim " << thread_dim.ToString()
+            << " exceeds limit constraints of " << limit.ToString();
+  }
+  return ok;
+}
+
+uint64 DivideCeil(uint64 x, uint64 y) {
+  return port::MathUtil::CeilOfRatio(x, y);
+}
+
+void CalculateDimensionality(const DeviceDescription &device_description,
+                             uint64 element_count, uint64 *threads_per_block,
+                             uint64 *block_count) {
+  *threads_per_block = device_description.threads_per_block_limit();
+  *block_count = DivideCeil(element_count, *threads_per_block);
+  if (*block_count == 1) {
+    CHECK_LE(element_count, *threads_per_block);
+    *threads_per_block = element_count;
+  }
+}
+
+// Round value up to a multiple of n.
+static uint64 RoundUp(uint64 value, uint64 n) {
+  return port::MathUtil::CeilOfRatio(value, n) * n;
+}
+
+// Round value down to a multiple of n.
+static uint64 RoundDown(uint64 value, uint64 n) {
+  return port::MathUtil::FloorOfRatio(value, n) * n;
+}
+
+uint64 CalculateOccupancy(const DeviceDescription &device_description,
+                          uint64 registers_per_thread,
+                          uint64 shared_memory_per_block,
+                          const ThreadDim &thread_dims) {
+  // Don't try to compute occupancy if necessary values are not initialized.
+  uint64 required_fields[] = { device_description.registers_per_thread_limit(),
+                               device_description.threads_per_warp(),
+                               device_description.warp_alloc_granularity(),
+                               device_description.register_alloc_granularity(),
+                               device_description.registers_per_block_limit(),
+                               device_description.shared_memory_per_core(),
+                               device_description.blocks_per_core_limit() };
+  for (auto value : required_fields) {
+    if (value == kUninitializedUint64) {
+      return 0;
+    }
+  }
+
+  if (registers_per_thread > device_description.registers_per_thread_limit()) {
+    return 0;
+  }
+
+  uint64 warps_per_block =
+      port::MathUtil::CeilOfRatio(thread_dims.x * thread_dims.y * thread_dims.z,
+                                  device_description.threads_per_warp());
+
+  // Warp resources are allocated at a particular granularity. This value is
+  // the effective number of warps for resource allocation purposes.
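+  // For example (illustrative numbers only): a block of 13 warps with a warp
+  // allocation granularity of 4 consumes resources as if it contained
+  // RoundUp(13, 4) = 16 warps.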
+ uint64 alloc_warps_per_block = + RoundUp(warps_per_block, device_description.warp_alloc_granularity()); + + uint64 alloc_regs_per_warp = + RoundUp(device_description.threads_per_warp() * registers_per_thread, + device_description.register_alloc_granularity()); + uint64 regs_per_block = alloc_warps_per_block * alloc_regs_per_warp; + uint64 reg_limit = + device_description.registers_per_block_limit() / regs_per_block; + + uint64 alloc_smem_per_block = RoundUp( + shared_memory_per_block, + device_description.shared_memory_alloc_granularity()); + uint64 smem_limit = alloc_smem_per_block > 0 ? + device_description.shared_memory_per_core() / alloc_smem_per_block : + device_description.blocks_per_core_limit(); + + uint64 thread_limit = device_description.threads_per_core_limit() + / (warps_per_block * device_description.threads_per_warp()); + + return std::min({ device_description.blocks_per_core_limit(), + reg_limit, smem_limit, thread_limit }); +} + +uint64 CalculateRegisterLimitForTargetOccupancy( + const DeviceDescription &device_description, uint64 shared_memory_per_block, + const ThreadDim &thread_dims, uint64 target_blocks_per_core) { + // Linear search from maximum number of registers down until the target + // blocks per SM is found. + // TODO(meheff): Compute this using a closed form solution. + int reg_step = device_description.register_alloc_granularity() / + device_description.threads_per_warp(); + for (int r = device_description.registers_per_thread_limit(); r > 0; + r = RoundDown(r - 1, reg_step)) { + uint64 occupancy = CalculateOccupancy( + device_description, r, shared_memory_per_block, thread_dims); + if (occupancy >= target_blocks_per_core) { + return r; + } + } + return 0; +} + + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h new file mode 100644 index 0000000000..e7b7102da5 --- /dev/null +++ b/tensorflow/stream_executor/device_description.h @@ -0,0 +1,370 @@ +// Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA +// device and platform properties. Also contains convenience functions for +// checking/calculating launch dimensionality based on device properties. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ +#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ + +#include <map> +#include <memory> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/launch_dim.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace internal { +class DeviceDescriptionBuilder; +} // namespace internal + +// Data that describes the execution target of the StreamExecutor, in terms of +// important logical parameters. These include dimensionality limits and +// physical parameters of interest, such as number of cores present on the +// device. +// +// Thread-safe: immutable post-initialization. +class DeviceDescription { + public: + // Returns the platform being run on; this value is primarily intended for + // printing, and comes out something like "OpenCL 1.2" or "Compute Capability + // 3.5". + const string &platform_version() const { return platform_version_; } + + // Returns the driver version interfacing with the underlying platform. Vendor + // dependent format. + const string &driver_version() const { return driver_version_; } + + // Return the runtime version, if one is provided by the underlying platform. 
+  // Vendor dependent format / usefulness.
+  const string &runtime_version() const { return runtime_version_; }
+
+  // Returns the name that the device reports. Vendor dependent.
+  const string &name() const { return name_; }
+
+  // Returns the PCI bus identifier for this device, of the form
+  // [domain]:[bus]:[device].[function]
+  const string &pci_bus_id() const { return pci_bus_id_; }
+
+  // Returns the NUMA node associated with this device, for use in
+  // determining socket locality. If the NUMA node could not be determined, -1
+  // is returned.
+  int numa_node() const { return numa_node_; }
+
+  // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA
+  // device or an AMD Compute Unit).
+  int core_count() const { return core_count_; }
+
+  // Returns the limit on the thread dimensionality values in each of the
+  // respective dimensions. These limits affect what constitutes a legitimate
+  // kernel launch request.
+  const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }
+
+  // Returns the limit on the block dimensionality values in each of the
+  // respective dimensions. These limits may affect what constitutes a
+  // legitimate kernel launch request.
+  const BlockDim &block_dim_limit() const { return block_dim_limit_; }
+
+  // Returns the limit on the number of simultaneously resident blocks
+  // on a multiprocessor.
+  const uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
+
+  // Returns the limit on the total number of threads that can be launched in a
+  // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
+  // This limit affects what constitutes a legitimate kernel launch request.
+  const uint64 &threads_per_block_limit() const {
+    return threads_per_block_limit_;
+  }
+
+  // Returns the limit on the total number of threads that can be
+  // simultaneously launched on a given multiprocessor.
+  const uint64 &threads_per_core_limit() const {
+    return threads_per_core_limit_;
+  }
+
+  // Returns the number of threads per warp/wavefront.
+  const uint64 &threads_per_warp() const { return threads_per_warp_; }
+
+  // Returns the limit on the total number of registers per core.
+  const uint64 &registers_per_core_limit() const {
+    return registers_per_core_limit_;
+  }
+
+  // Returns the limit on the total number of registers that can be
+  // simultaneously used by a block.
+  const uint64 &registers_per_block_limit() const {
+    return registers_per_block_limit_;
+  }
+
+  // Returns the limit on the total number of registers that can be
+  // allocated to a thread.
+  const uint64 &registers_per_thread_limit() const {
+    return registers_per_thread_limit_;
+  }
+
+  // Returns the granularity at which warps are allocated resources.
+  const uint64 &warp_alloc_granularity() const {
+    return warp_alloc_granularity_;
+  }
+
+  // Returns the granularity at which registers are allocated to warps.
+  const uint64 &register_alloc_granularity() const {
+    return register_alloc_granularity_;
+  }
+
+  // Returns the granularity at which shared memory is allocated to warps.
+  const uint64 &shared_memory_alloc_granularity() const {
+    return shared_memory_alloc_granularity_;
+  }
+
+  // Returns the number of address bits available to kernel code running on the
+  // platform. This affects things like the maximum allocation size and perhaps
+  // types used in kernel code such as size_t.
+  const uint64 &device_address_bits() const { return device_address_bits_; }
+
+  // Returns the device memory size in bytes.
+  uint64 device_memory_size() const { return device_memory_size_; }
+
+  // Returns the device's core clock rate in GHz.
+  const float clock_rate_ghz() const { return clock_rate_ghz_; }
+
+  // Returns whether ECC is enabled.
+  bool ecc_enabled() const { return ecc_enabled_; }
+
+  // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
+  // Micro Devices, Inc.", or "GenuineIntel".
+  const string &device_vendor() const { return device_vendor_; }
+
+  // Returns the CUDA compute capability if we're running on the CUDA platform.
+  // If a CUDA compute capability is not available, the return value will be
+  // false and the reported major version will be non-positive.
+  bool cuda_compute_capability(int *major, int *minor) const;
+
+  // Returns the maximum amount of shared memory present on a single core
+  // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
+  // devices). Note that some devices, such as NVIDIA's, have a configurable
+  // partitioning between shared memory and L1 cache.
+  uint64 shared_memory_per_core() const { return shared_memory_per_core_; }
+
+  // Returns the maximum amount of shared memory available for a single block.
+  uint64 shared_memory_per_block() const { return shared_memory_per_block_; }
+
+  // TODO(leary): resident blocks per core will be useful.
+
+  // Convenience typedef for the string-based DeviceDescription mapping.
+  typedef std::map<string, string> Map;
+
+  // Returns a mapping from readable names to readable values that describe the
+  // device. This is useful for things like printing.
+  std::unique_ptr<Map> ToMap() const;
+
+  // For string values that are not available via the underlying platform, this
+  // value will be provided.
+  static const char *kUndefinedString;
+
+ private:
+  friend class internal::DeviceDescriptionBuilder;
+
+  DeviceDescription();
+
+  // For description of the following members, see the corresponding accessor
+  // above.
+  //
+  // N.B. If another field is added, update ToMap() above.
+  string device_vendor_;
+  string platform_version_;
+  string driver_version_;
+  string runtime_version_;
+  string pci_bus_id_;
+  string name_;
+
+  ThreadDim thread_dim_limit_;
+  BlockDim block_dim_limit_;
+
+  uint64 blocks_per_core_limit_;
+
+  uint64 threads_per_core_limit_;
+  uint64 threads_per_block_limit_;
+  uint64 threads_per_warp_;
+
+  uint64 registers_per_core_limit_;
+  uint64 registers_per_block_limit_;
+  uint64 registers_per_thread_limit_;
+
+  uint64 warp_alloc_granularity_;
+  uint64 register_alloc_granularity_;
+  uint64 shared_memory_alloc_granularity_;
+
+  uint64 device_address_bits_;
+  uint64 device_memory_size_;
+
+  // Shared memory limits on a given device.
+  uint64 shared_memory_per_core_;
+  uint64 shared_memory_per_block_;
+
+  float clock_rate_ghz_;
+
+  // CUDA "CC" major value, -1 if not available.
+  int cuda_compute_capability_major_;
+  int cuda_compute_capability_minor_;
+
+  int numa_node_;
+  int core_count_;
+  bool ecc_enabled_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
+};
+
+namespace internal {
+
+// Helper class that builds a device description, given that it has a large
+// number of fields that would be easily confused in constructor form.
+class DeviceDescriptionBuilder {
+ public:
+  DeviceDescriptionBuilder();
+
+  // For descriptions of the following fields, see comments on the
+  // corresponding DeviceDescription::* accessors above.
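+  //
+  // A typical construction sketch (field values are illustrative only):
+  //
+  //   internal::DeviceDescriptionBuilder builder;
+  //   builder.set_name("example device");
+  //   builder.set_threads_per_warp(32);
+  //   builder.set_threads_per_block_limit(1024);
+  //   std::unique_ptr<DeviceDescription> description = builder.Build();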
+ + void set_device_vendor(const string &value) { + device_description_->device_vendor_ = value; + } + void set_platform_version(const string &value) { + device_description_->platform_version_ = value; + } + void set_driver_version(const string &value) { + device_description_->driver_version_ = value; + } + void set_runtime_version(const string &value) { + device_description_->runtime_version_ = value; + } + void set_pci_bus_id(const string &value) { + device_description_->pci_bus_id_ = value; + } + void set_name(const string &value) { device_description_->name_ = value; } + + void set_thread_dim_limit(const ThreadDim &value) { + device_description_->thread_dim_limit_ = value; + } + void set_block_dim_limit(const BlockDim &value) { + device_description_->block_dim_limit_ = value; + } + + void set_blocks_per_core_limit(uint64 value) { + device_description_->blocks_per_core_limit_ = value; + } + + void set_threads_per_core_limit(uint64 value) { + device_description_->threads_per_core_limit_ = value; + } + void set_threads_per_block_limit(uint64 value) { + device_description_->threads_per_block_limit_ = value; + } + void set_threads_per_warp(uint64 value) { + device_description_->threads_per_warp_ = value; + } + + void set_registers_per_core_limit(uint64 value) { + device_description_->registers_per_core_limit_ = value; + } + void set_registers_per_block_limit(uint64 value) { + device_description_->registers_per_block_limit_ = value; + } + void set_registers_per_thread_limit(uint64 value) { + device_description_->registers_per_thread_limit_ = value; + } + + void set_warp_alloc_granularity(uint64 value) { + device_description_->warp_alloc_granularity_ = value; + } + void set_register_alloc_granularity(uint64 value) { + device_description_->register_alloc_granularity_ = value; + } + void set_shared_memory_alloc_granularity(uint64 value) { + device_description_->shared_memory_alloc_granularity_ = value; + } + + void set_device_address_bits(uint64 value) { + device_description_->device_address_bits_ = value; + } + void set_device_memory_size(uint64 value) { + device_description_->device_memory_size_ = value; + } + + void set_shared_memory_per_core(int64 value) { + device_description_->shared_memory_per_core_ = value; + } + void set_shared_memory_per_block(int64 value) { + device_description_->shared_memory_per_block_ = value; + } + + void set_clock_rate_ghz(float value) { + device_description_->clock_rate_ghz_ = value; + } + + void set_cuda_compute_capability(int major, int minor) { + device_description_->cuda_compute_capability_major_ = major; + device_description_->cuda_compute_capability_minor_ = minor; + } + + void set_numa_node(int value) { device_description_->numa_node_ = value; } + void set_core_count(int value) { device_description_->core_count_ = value; } + void set_ecc_enabled(bool value) { + device_description_->ecc_enabled_ = value; + } + + // Returns a built DeviceDescription with ownership transferred to the + // caller. There are currently no restrictions on which fields must be set in + // order to build the descriptor. + // + // Once the description is built, this builder object should be discarded. + std::unique_ptr<DeviceDescription> Build() { + return std::move(device_description_); + } + + private: + std::unique_ptr<DeviceDescription> device_description_; + + SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder); +}; + +} // namespace internal + +// Returns whether the given thread_dim is acceptable given the limits described +// in device_description. 
For detailed reasons for failing the predicate, enable +// VLOG(2) for this module. +bool ThreadDimOk(const DeviceDescription &device_description, + const ThreadDim &thread_dim); + +// [deprecated] Use MathUtil::CeilOfRatio directly instead. +// +// Equivalent to ceil(double(element_count) / threads_per_block). +uint64 DivideCeil(uint64 x, uint64 y); + +// Calculate the number of threads/blocks required to process element_count +// elements. Note that you can still end up with more threads than +// element_count due to rounding, so kernels often start with an "is this +// thread id in the element_count range?" test. +void CalculateDimensionality(const DeviceDescription &device_description, + uint64 element_count, uint64 *threads_per_block, + uint64 *block_count); + +// Compute and return maximum blocks per core (occupancy) based on the +// device description, some kernel characteristics and the number of threads per +// block. If unable to compute occupancy, zero is returned. +uint64 CalculateOccupancy(const DeviceDescription &device_description, + uint64 registers_per_thread, + uint64 shared_memory_per_block, + const ThreadDim &thread_dims); + +// Compute and return the maximum number of registers per thread which +// achieves the target occupancy. If the target is not possible then +// zero is returned. +uint64 CalculateRegisterLimitForTargetOccupancy( + const DeviceDescription &device_description, uint64 shared_memory_per_block, + const ThreadDim &thread_dims, uint64 target_blocks_per_core); + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ diff --git a/tensorflow/stream_executor/device_memory.h b/tensorflow/stream_executor/device_memory.h new file mode 100644 index 0000000000..9e88180316 --- /dev/null +++ b/tensorflow/stream_executor/device_memory.h @@ -0,0 +1,284 @@ +// Suite of types that represent device memory allocations. These are +// allocated by the StreamExecutor interface, which produces values appropriate +// for the underlying platform (whether it be CUDA or OpenCL). +// +// The untyped base class (like a device void*) is DeviceMemoryBase, which can +// be specialized for a given allocation type (like a device T*) using +// DeviceMemory<T>. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ +#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ + +#include <stddef.h> + +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +class StreamExecutor; + +// void*-analogous device memory allocation. For the typed variation, see +// DeviceMemory<T>. +// +// This is effectively a two-tuple of a pointer and size; however, note that the +// pointer may not be to the virtual address itself -- in OpenCL the pointer is +// to a cl_mem handle that describes the device allocation. Therefore, +// DeviceMemoryBase::opaque does not necessarily produce a pointer that can be +// referenced directly, so use it with caution. +// +// Thread-compatible. +class DeviceMemoryBase { + public: + // Default constructor instantiates a null-pointed, zero-sized device memory + // region. An opaque pointer may be provided -- see header for details on the + // opacity of that pointer. + explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0, + bool is_sub_buffer = false) + : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {} + + // Returns whether the backing memory is the null pointer. 
+ // A `== nullptr` convenience method is also provided. + bool is_null() const { return opaque_ == nullptr; } + bool operator==(std::nullptr_t other) const { return is_null(); } + bool operator!=(std::nullptr_t other) const { return !is_null(); } + + // Provides a partial order between device memory values. + // + // This operator is provided so that this object can be used as a key in an + // ordered map. + bool operator<(const DeviceMemoryBase &other) const { + return opaque() < other.opaque(); + } + + // Returns the size, in bytes, for the backing memory. + uint64 size() const { return size_; } + + // Warning: note that the pointer returned is not necessarily directly to + // device virtual address space, but is platform-dependent. + void *opaque() { return opaque_; } + const void *opaque() const { return opaque_; } + + // Returns true if this is an offset into another primary allocation. + bool is_sub_buffer() const { return is_sub_buffer_; } + + // Returns whether the two DeviceMemoryBase segments are identical (both in + // their opaque pointer and size). + bool IsSameAs(const DeviceMemoryBase &other) const { + return opaque() == other.opaque() && size() == other.size(); + } + + protected: + friend class StreamExecutor; + + // Resets the internal values of the opaque pointer and number of bytes in the + // memory region, just as in the constructor. + void Reset(void *opaque, uint64 bytes) { + opaque_ = opaque; + size_ = bytes; + } + + private: + void *opaque_; // Platform-dependent value representing allocated memory. + uint64 size_; // Size in bytes of this allocation. + bool is_sub_buffer_; // Is this a primary allocation or a sub-buffer? +}; + +// Typed wrapper around "void *"-like DeviceMemoryBase. +// +// For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase +// that represents one or more integers in Device memory. +// +// Thread-compatible. +template <typename ElemT> +class DeviceMemory final : public DeviceMemoryBase { + public: + // Default constructor instantiates a null-pointed, zero-sized memory region. + DeviceMemory() : DeviceMemoryBase(nullptr, 0) {} + + // Typed device memory regions may be constructed from untyped device memory + // regions, this effectively amounts to a cast from a void*. + explicit DeviceMemory(const DeviceMemoryBase &other) + : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(), + other.size(), other.is_sub_buffer()) {} + + static constexpr size_t kElemSize = sizeof(ElemT); + + // Returns the number of elements of type ElemT that constitute this + // allocation. + uint64 ElementCount() const { return size() / kElemSize; } + + // Returns whether this is a single-element allocation. + bool IsScalar() const { return ElementCount() == 1; } + + // Create a typed area of DeviceMemory with a given opaque pointer and the + // quantity of bytes in the allocation. This function is broken out to + // distinguish bytes from an element count. + static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) { + return DeviceMemory<ElemT>(opaque, bytes); + } + + // Resets the DeviceMemory data, in MakeFromByteSize fashion. + // This simply clobbers the prior values. + void ResetFromByteSize(void *opaque, uint64 bytes) { + // TODO(leary) when NVCC is eliminated we can add this check (and the + // logging include it requires). 
+ // CHECK_EQ(0, bytes % kElemSize); + DeviceMemoryBase::Reset(opaque, bytes); + } + + // ------------------------------------------------------------ + // DO NOT USE - FASTR TEAM-INTERNAL FUNCTIONS + // Used internally by gcudacc. +#ifdef __GCUDACC__ + // Implicit conversion operators needed to support mixed mode. Since buffer + // sizes aren't used in the CUDA launching process, and since the constructed + // objects are all temporary, this is safe. + // Linter warning disabled as we require an implicit conversion. + DeviceMemory(const ElemT *opaque) : // NOLINT + DeviceMemoryBase(reinterpret_cast<void *>(const_cast<ElemT *>(opaque)), + 0) {} + + operator ElemT *() { return reinterpret_cast<ElemT *>(opaque()); } + operator const ElemT *() { + return const_cast<const ElemT *>(reinterpret_cast<ElemT *>(opaque())); + } +#endif + // ------------------------------------------------------------ + + protected: + // This constructor is solely used from derived classes; it is made protected + // because it accepts a byte-size instead of an element count, which could + // potentially be misused given the ElementCount() nature of this interface. + // + // In order to specify the desire to use byte size instead of element count + // explicitly, use MakeFromByteSize. + DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {} +}; + +// A class to encapsulate the type and size of a dynamic shared memory +// buffer. Because the buffer exists solely on the device and is not copyable +// to the host, memory objects of this type do not maintain buffer pointers +// on the host. +template <typename ElemT> +class SharedDeviceMemory final : public DeviceMemoryBase { + public: + explicit SharedDeviceMemory(uint64 elem_count) + : DeviceMemoryBase(nullptr, elem_count * kElemSize) {} + + static constexpr size_t kElemSize = sizeof(ElemT); + + // Returns the number of elements of type ElemT that constitute this + // allocation. + uint64 ElementCount() const { return size() / kElemSize; } + + // Returns whether this is a single-element allocation. + bool IsScalar() const { return ElementCount() == 1; } +}; + +// Similar to the typed DeviceMemory, but is the unique owner of its +// memory, if any. ScopedDeviceMemory is thread-compatible. It is also +// movable and uncopyable to represent unique ownership. +template <typename ElemT> +class ScopedDeviceMemory { + public: + // Parameters: + // parent: Executor used to deallocate memory when this instance goes + // out of scope. + // value: Already-allocated device memory value for this scoped mechanism to + // deallocate. This memory must have been allocated by parent. + ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value); + + // Constructor overload that places a literal array into device memory + ScopedDeviceMemory(StreamExecutor *parent, + std::initializer_list<ElemT> values); + + // Moves ownership of the memory from other to the constructed + // object. + // + // Postcondition: other == nullptr. + ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept: + ScopedDeviceMemory(other.parent_, other.Release()) {} + + // Releases the memory that was provided in the constructor, through the + // "parent" StreamExecutor. + ~ScopedDeviceMemory(); + + // Moves ownership of the memory from other to this object. + // + // Postcondition: other == nullptr. 
+ ScopedDeviceMemory& operator=(ScopedDeviceMemory &&other) { + Reset(other.Release()); + parent_ = other.parent_; + return *this; + } + + // Returns the memory that backs this scoped allocation converted to + // DeviceMemory<T> apparent type. This is useful for cases where the + // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't + // allow copying, for scoped-object-lifetime reasons. + const DeviceMemory<ElemT> &cref() const { return wrapped_; } + + // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable + // operations. The value returned should not be used outside the scope of this + // ScopedDeviceMemory object's lifetime. + DeviceMemory<ElemT> *ptr() { return &wrapped_; } + const DeviceMemory<ElemT> *ptr() const { return &wrapped_; } + + // Smart-pointer-like operators for the wrapped DeviceMemory. + // This reference must not be used outside the lifetime of this + // ScopedDeviceMemory. + const DeviceMemory<ElemT> &operator*() const { return cref(); } + DeviceMemory<ElemT> *operator->() { return ptr(); } + const DeviceMemory<ElemT> *operator->() const { return ptr(); } + bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); } + bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); } + + // Analogous to std::unique_ptr::reset, frees the existing memory held in + // this scoped memory container and replaces it with updated. Ownership + // of updated is transferred to this object. + void Reset(DeviceMemory<ElemT> updated); + void Reset(std::nullptr_t); + + // Analogous to std::unique_ptr::release, releases ownership of the held + // memory and transfers it to the caller. + // + // Postcondition: *this == nullptr + DeviceMemory<ElemT> Release() { + auto tmp = wrapped_; + wrapped_.ResetFromByteSize(nullptr, 0); + return tmp; + } + + private: + DeviceMemory<ElemT> wrapped_; // Value we wrap with scoped-release. + StreamExecutor *parent_; // See constructor. + + SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory); +}; + +// Host-side representation of packed-and-aligned vector datatypes on the device +// side. Since these can appear in device kernel signatures, we support +// launching them with these datatypes in launch signatures. + +struct Float2 { + float x, y; +}; + +struct Float4 { + Float2 xz, yw; +}; + +struct Double2 { + double x, y; +}; + +static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed"); +static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed"); +static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed"); + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ diff --git a/tensorflow/stream_executor/device_options.h b/tensorflow/stream_executor/device_options.h new file mode 100644 index 0000000000..bd393a6efb --- /dev/null +++ b/tensorflow/stream_executor/device_options.h @@ -0,0 +1,70 @@ +// Contains device-level options that can be specified at a platform level. +// Example usage: +// auto device_options = DeviceOptions::Default(); + +#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_OPTIONS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_OPTIONS_H_ + +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/platform/logging.h" + +namespace perftools { +namespace gputools { + +// Indicates a set of options for a device's usage, which generally must be +// provided at StreamExecutor device-initialization time. 
+// +// These are intended to be useful-but-not-mandatorily-supported options for +// using devices on the underlying platform. Presently, if the option requested +// is not available on the target platform, a warning will be emitted. +struct DeviceOptions { + public: + // When it is observed that more memory has to be allocated for thread stacks, + // this flag prevents it from ever being deallocated. Potentially saves + // thrashing the thread stack memory allocation, but at the potential cost of + // some memory space. + static const unsigned kDoNotReclaimStackAllocation = 0x1; + + // The following options refer to synchronization options when + // using SynchronizeStream or SynchronizeContext. + + // Synchronize with spinlocks. + static const unsigned kScheduleSpin = 0x02; + // Synchronize with spinlocks that also call CPU yield instructions. + static const unsigned kScheduleYield = 0x04; + // Synchronize with a "synchronization primitive" (e.g. mutex). + static const unsigned kScheduleBlockingSync = 0x08; + + static const unsigned kMask = 0xf; // Mask of all available flags. + + // Constructs an or-d together set of device options. + explicit DeviceOptions(unsigned flags) : flags_(flags) { + CHECK((flags & kMask) == flags); + } + + // Factory for the default set of device options. + static DeviceOptions Default() { return DeviceOptions(0); } + + unsigned flags() const { return flags_; } + + bool operator==(const DeviceOptions& other) const { + return flags_ == other.flags_; + } + + bool operator!=(const DeviceOptions& other) const { + return !(*this == other); + } + + string ToString() { + return flags_ == 0 ? "none" : "kDoNotReclaimStackAllocation"; + } + + private: + unsigned flags_; +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_OPTIONS_H_ diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc new file mode 100644 index 0000000000..020de7f7bb --- /dev/null +++ b/tensorflow/stream_executor/dnn.cc @@ -0,0 +1,297 @@ +#include "tensorflow/stream_executor/dnn.h" + +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" + +namespace perftools { +namespace gputools { +namespace dnn { + +string ActivationModeString(ActivationMode mode) { + switch (mode) { + case ActivationMode::kSigmoid: + return "sigmoid"; + case ActivationMode::kRelu: + return "relu"; + case ActivationMode::kRelu6: + return "relu6"; + case ActivationMode::kReluX: + return "reluX"; + case ActivationMode::kTanh: + return "tanh"; + default: + LOG(FATAL) << "Unknown activation_mode " << static_cast<int32>(mode); + } +} + +string ElementwiseOperationString(ElementwiseOperation op) { + switch (op) { + case ElementwiseOperation::kAdd: + return "add"; + case ElementwiseOperation::kMultiply: + return "multiply"; + default: + LOG(FATAL) << "Unknown elementwise op " << static_cast<int32>(op); + } +} + +string DataLayoutString(DataLayout layout) { + switch (layout) { + case DataLayout::kYXDepthBatch: + return "YXDepthBatch"; + case DataLayout::kYXBatchDepth: + return "YXBatchDepth"; + case DataLayout::kBatchYXDepth: + return "BatchYXDepth"; + case DataLayout::kBatchDepthYX: + return "BatchDepthYX"; + default: + LOG(FATAL) << "Unknown data layout " << static_cast<int32>(layout); + } +} + +string FilterLayoutString(FilterLayout layout) { + switch (layout) { + case FilterLayout::kOutputInputYX: + return "OutputInputYX"; + case FilterLayout::kInputYXOutput: + return "InputYXOutput"; + case 
FilterLayout::kYXInputOutput: + return "YXInputOutput"; + default: + LOG(FATAL) << "Unknown filter layout " << static_cast<int32>(layout); + } +} + +// -- BatchDescriptor + +BatchDescriptor::BatchDescriptor() + : count_(0), + feature_map_count_(0), + height_(0), + width_(0), + value_max_(0.0), + value_min_(0.0), + layout_(DataLayout::kYXDepthBatch), + quantized_activation_mode_(QuantizedActivationMode::k8Bit) {} + +void BatchDescriptor::CloneFrom(const BatchDescriptor& other) { + count_ = other.count_; + feature_map_count_ = other.feature_map_count_; + height_ = other.height_; + width_ = other.width_; + value_max_ = other.value_max_; + value_min_ = other.value_min_; + layout_ = other.layout_; + quantized_activation_mode_ = other.quantized_activation_mode_; +} + +string BatchDescriptor::ToString() const { + return port::Printf( + "{count: %lld feature_map_count: %lld height: %lld width: %lld " + "value_min: %f value_max: %f layout: %s}", + count_, feature_map_count_, height_, width_, value_min_, value_max_, + DataLayoutString(layout_).c_str()); +} + +string BatchDescriptor::ToShortString() const { + // All the constituent strings are less than 15 characters, so the + // small string optimization ensures that there will be at most one + // heap memory allocation. + string x = port::StrCat("x", width()); + string y = port::StrCat("y", height()); + string depth = port::StrCat("d", feature_map_count()); + string batch = port::StrCat("b", count()); + + string suffix; + if (value_min() != value_max()) { + port::StrAppend(&suffix, "[", value_min(), ";", value_max(), "]"); + } + if (quantized_activation_mode() == QuantizedActivationMode::k16Bit) { + suffix += "_16bit"; + } + + switch (layout()) { + case DataLayout::kYXDepthBatch: + return port::StrCat(y, x, depth, batch, suffix); + case DataLayout::kYXBatchDepth: + return port::StrCat(y, x, batch, depth, suffix); + case DataLayout::kBatchYXDepth: + return port::StrCat(batch, y, x, depth, suffix); + case DataLayout::kBatchDepthYX: + return port::StrCat(batch, depth, y, x, suffix); + default: + LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout()); + } +} + +int64 BatchDescriptor::NodesPerFeatureMap() const { return width_ * height_; } + +int64 BatchDescriptor::NodesAcrossFeatureMaps() const { + return NodesPerFeatureMap() * feature_map_count_; +} + +int64 BatchDescriptor::ElementCount() const { + return count_ * feature_map_count_ * height_ * width_; +} + +int64 BatchDescriptor::FullyConnectedWeightCount( + const BatchDescriptor& input, const BatchDescriptor& output) { + return input.NodesAcrossFeatureMaps() * output.NodesAcrossFeatureMaps(); +} + +int64 BatchDescriptor::FullyConnectedBiasCount(const BatchDescriptor& output) { + return output.NodesAcrossFeatureMaps(); +} + +// -- FilterDescriptor + +FilterDescriptor::FilterDescriptor() + : output_feature_map_count_(0), + input_feature_map_count_(0), + input_filter_height_(0), + input_filter_width_(0), + layout_(FilterLayout::kOutputInputYX) {} + +FilterDescriptor::~FilterDescriptor() {} + +void FilterDescriptor::CloneFrom(const FilterDescriptor& other) { + set_output_feature_map_count(other.output_feature_map_count()) + .set_input_feature_map_count(other.input_feature_map_count()) + .set_input_filter_height(other.input_filter_height()) + .set_input_filter_width(other.input_filter_width()) + .set_layout(other.layout()); +} + +string FilterDescriptor::ToString() const { + return port::Printf( + "{output_feature_map_count: %lld input_feature_map_count: %lld " + "input_filter_height: %lld 
input_filter_width: %lld layout: %s}", + output_feature_map_count_, input_feature_map_count_, input_filter_height_, + input_filter_width_, FilterLayoutString(layout_).c_str()); +} + +string FilterDescriptor::ToShortString() const { + // All the constituent strings are less than 15 characters, so the + // small string optimization ensures that there will be at most one + // heap memory allocation. + string od = port::StrCat("od", output_feature_map_count_); + string id = port::StrCat("id", input_feature_map_count_); + string y = port::StrCat("y", input_filter_height_); + string x = port::StrCat("x", input_filter_width_); + + switch (layout_) { + case FilterLayout::kOutputInputYX: + return port::StrCat(od, id, y, x); + case FilterLayout::kInputYXOutput: + return port::StrCat(id, y, x, od); + case FilterLayout::kYXInputOutput: + return port::StrCat(y, x, id, od); + default: + LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout_); + } +} + +int64 FilterDescriptor::ComputeWeightCount() const { + return output_feature_map_count_ * input_feature_map_count_ * + input_filter_height_ * input_filter_width_; +} + +// -- ConvolutionDescriptor + +ConvolutionDescriptor::ConvolutionDescriptor() + : zero_padding_height_(0), + zero_padding_width_(0), + vertical_filter_stride_(1), + horizontal_filter_stride_(1) {} + +ConvolutionDescriptor::~ConvolutionDescriptor() {} + +string ConvolutionDescriptor::ToString() const { + return port::Printf( + "{zero_padding_height: %lld zero_padding_width: %lld " + "vertical_filter_stride: %lld horizontal_filter_stride: %lld}", + zero_padding_height_, zero_padding_width_, vertical_filter_stride_, + horizontal_filter_stride_); +} + +string ConvolutionDescriptor::ToShortString() const { + return port::StrCat("py:", zero_padding_height_, "_px:", zero_padding_width_, + "_sy:", vertical_filter_stride_, "_sx:", + horizontal_filter_stride_); +} + +// -- PoolingDescriptor + +PoolingDescriptor::PoolingDescriptor() + : mode_(dnn::PoolingMode::kMaximum), + window_height_(0), + window_width_(0), + vertical_padding_(0), + horizontal_padding_(0), + vertical_stride_(0), + horizontal_stride_(0) {} + +void PoolingDescriptor::CloneFrom(const PoolingDescriptor& other) { + mode_ = other.mode_; + window_height_ = other.window_height_; + window_width_ = other.window_width_; + vertical_padding_ = other.vertical_padding_; + horizontal_padding_ = other.horizontal_padding_; + vertical_stride_ = other.vertical_stride_; + horizontal_stride_ = other.horizontal_stride_; +} + +string PoolingDescriptor::ToString() const { + const char* mode_string = + mode_ == dnn::PoolingMode::kMaximum ? "kMaximum" : "kAverage"; + return port::Printf( + "{mode: %s window_height: %lld window_width: %lld vertical_stride: %lld " + "horizontal_stride: %lld vertical padding: %lld horizontal padding: " + "%lld}", + mode_string, window_height_, window_width_, vertical_stride_, + horizontal_stride_, vertical_padding_, horizontal_padding_); +} + +string PoolingDescriptor::ToShortString() const { + return port::StrCat(mode_ == dnn::PoolingMode::kMaximum ? 
"max" : "avg", + "_y:", window_height_, "_x:", window_width_, "_py:", + vertical_padding_, "_px:", horizontal_padding_, "_sy:", + vertical_stride_, "_sx:", horizontal_stride_); +} + +// -- NormalizeDescriptor + +NormalizeDescriptor::NormalizeDescriptor() + : bias_(0.0), + range_(0), + alpha_(0.0), + beta_(0.0), + wrap_around_(false), + segment_size_(0) {} + +void NormalizeDescriptor::CloneFrom(const NormalizeDescriptor& other) { + bias_ = other.bias_; + range_ = other.range_; + alpha_ = other.alpha_; + beta_ = other.beta_; + wrap_around_ = other.wrap_around_; + segment_size_ = other.segment_size_; +} + +string NormalizeDescriptor::ToString() const { + return port::Printf( + "{bias: %f range: %d alpha: %f beta: %f wrap_around: %d " + "segment_size: %d}", + bias_, range_, alpha_, beta_, wrap_around_, segment_size_); +} + +string NormalizeDescriptor::ToShortString() const { + return port::StrCat("bias:", bias_, "_range:", range_, "_alpha:", alpha_, + "_beta:", beta_, "_wrap:", wrap_around_, "_size:", + segment_size_); +} + +} // namespace dnn +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h new file mode 100644 index 0000000000..e737d1c78f --- /dev/null +++ b/tensorflow/stream_executor/dnn.h @@ -0,0 +1,895 @@ +// Neural Net operation support for StreamExecutor instances. +// +// This is an abstract interface for a platform to optionally support common +// neural net operations; it accommodates implementations such as the cudnn +// library operations. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_ +#define TENSORFLOW_STREAM_EXECUTOR_DNN_H_ + +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +class Stream; + +namespace dnn { + +// Describes how an input or output layer's data is formatted. +// Specify int64 so there's no padding in BatchDescriptor. +enum class DataLayout : int64 { + kYXDepthBatch = 0, // Same as dist_belief::DF_DEPTH_MAJOR. + kYXBatchDepth, // Same as dist_belief::DF_BATCH_MAJOR. + kBatchYXDepth, // Same as run_brain output, and tensorflow's layout. + kBatchDepthYX, // cuDNN's NCHW layout, data laid out as image, feature, + // maps, rows, columns. +}; + +// Returns a string representation of the given data layout. +string DataLayoutString(DataLayout layout); + +// Specifies a quantization for activations in a given BatchDescriptor. +enum class QuantizedActivationMode { + k8Bit = 1, + k16Bit = 2, + k32Bit = 4, +}; + +// Describes the dimensions that a layer consumes/produces. +// +// This is a matrix (height, width), its "depth" (feature_map_count), +// how many of these matrices are present (count), +// and the maximum and minimum values expected in the matrix (value_max, +// value_min). +// If input is quantized, all values greater +// than value_max will be clipped to value_max and all values less than +// value_min will be clipped to value_min. +// When quantized output is dequantized no value will be greater than +// value_max or less than value_min. +// +// Uses the named argument construction form: +// +// auto input_batch_dimensions = +// BatchDescriptor().set_count(42).set_feature_map_count(7)... 
+//
+// Details:
+//
+// For a convolutional layer, a single inference takes a 3-dimensional matrix
+// of input and produces a 3-dimensional matrix of output. We call the three
+// dimensions height, width and feature_map_count, where for an image, the
+// height and width correspond to the Y and X pixel indices, respectively, and
+// the feature_map_count corresponds to the RGB dimension of the input data.
+// Then the count indicates how many 3D matrices are being presented to be
+// processed at once; this corresponds to the neural network concept of
+// minibatch size.
+//
+// For a fully connected layer, it's better to put the nodes of the layer in
+// the feature_map_count, and leave the height and width as degenerate (== 1).
+// Count indicates how many input vectors (degenerate 3D matrices) are to be
+// processed.
+//
+// If unspecified, value_max and value_min default to 0.0.
+// If value_max == value_min the Stream will attempt to derive valid values -
+// for example the output of Relu6 activation will always be in the range
+// [0.0, 6.0].
+//
+// If unspecified, layout defaults to kYXDepthBatch.
+class BatchDescriptor {
+ public:
+  // Creates a "blank" batch descriptor, which should be initialized via the
+  // named argument helpers.
+  BatchDescriptor();
+
+  // Clones values from 'other' for initialization.
+  void CloneFrom(const BatchDescriptor& other);
+
+  string ToString() const;
+  string ToShortString() const;
+
+  // Accessors.
+  int64 count() const { return count_; }
+  int64 feature_map_count() const { return feature_map_count_; }
+  int64 height() const { return height_; }
+  int64 width() const { return width_; }
+  float value_max() const { return value_max_; }
+  float value_min() const { return value_min_; }
+  DataLayout layout() const { return layout_; }
+  QuantizedActivationMode quantized_activation_mode() const {
+    return quantized_activation_mode_;
+  }
+
+  // Named-argument helpers for avoiding user error during construction.
+  BatchDescriptor& set_count(int64 value) {
+    count_ = value;
+    return *this;
+  }
+  BatchDescriptor& set_feature_map_count(int64 value) {
+    feature_map_count_ = value;
+    return *this;
+  }
+  BatchDescriptor& set_height(int64 value) {
+    height_ = value;
+    return *this;
+  }
+  BatchDescriptor& set_width(int64 value) {
+    width_ = value;
+    return *this;
+  }
+  BatchDescriptor& set_value_max(float value) {
+    value_max_ = value;
+    return *this;
+  }
+  BatchDescriptor& set_value_min(float value) {
+    value_min_ = value;
+    return *this;
+  }
+  BatchDescriptor& set_layout(DataLayout layout) {
+    layout_ = layout;
+    return *this;
+  }
+  BatchDescriptor& set_quantized_activation_mode(
+      QuantizedActivationMode quantized_activation_mode) {
+    quantized_activation_mode_ = quantized_activation_mode;
+    return *this;
+  }
+
+  // Return the number of nodes in a single feature map.
+  int64 NodesPerFeatureMap() const;
+
+  // Return the number of nodes across all feature maps. Note that this is not
+  // affected by the batch count.
+  int64 NodesAcrossFeatureMaps() const;
+
+  // Returns the number of elements (e.g. RGB pixel values) required to hold a
+  // given batch descriptor, given a no-padding assumption. Note that this is
+  // affected by the batch count.
+  int64 ElementCount() const;
+
+  // Return the number of weights required to fully connect a layer with
+  // dimensions given by the 'input' descriptor with a layer with dimensions
+  // given by the 'output' descriptor.
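+  // (For a sense of scale, with illustrative numbers only: a 1x1x4096 input
+  // layer fully connected to a 1x1x1000 output layer requires one weight per
+  // input/output node pair, i.e. 4096 * 1000 = 4,096,000 weights.)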
+ static int64 FullyConnectedWeightCount(const BatchDescriptor& input, + const BatchDescriptor& output); + + // Return the number of biases required to fully connect to an output layer + // with dimensions given the 'output' descriptor. + static int64 FullyConnectedBiasCount(const BatchDescriptor& output); + + private: + int64 count_; + int64 feature_map_count_; + int64 height_; + int64 width_; + float value_max_; + float value_min_; + DataLayout layout_; + QuantizedActivationMode quantized_activation_mode_; +}; + +// Describes how a filter is laid out in the memory. +// Specify int64 so there's no padding in FilterDescriptor. +enum class FilterLayout : int64 { + kOutputInputYX = 0, // cuDNN's default filter layout, laid out as: + // (major) output feature maps >> input feature maps >> + // rows >> columns (minor). + kInputYXOutput, // Same as dist_belief's default filter layout. + kYXInputOutput, // Same as tensorflow's default filter layout. +}; + +// Returns a string representation of the given filter layout. +string FilterLayoutString(FilterLayout layout); + +// Describes a filter for the convolution. This is the "window" from +// height-by-width patches of each of the feature maps in the input layer to the +// cells within the output feature map. +// +// Uses the named argument construction form: +// +// FilterDescriptor filter_dimensions; +// filter_dimensions +// .set_output_feature_map_count(42) +// .set_input_feature_map_count(7) +// ... +// +// Arguments: +// - output_feature_map_count: number of feature maps in the output layer. +// - input_feature_map_count: number of feature maps in the input layer (from +// which the filter patch is taken). +// - input_filter_height: "height" number of neurons used in the sliding window +// over the input layer. +// - input_filter_width: "width" number of neurons used in the sliding window +// over the input layer. +// +// Sometimes names like "filter input height" are referred to by synonymous +// terminology, such as "kernel y size". +// +// If unspecified, layout defaults to kOutputInputYX. +class FilterDescriptor { + public: + // By default construction, all dimensions are set to zero, so they should all + // be populated by the user via the named-argument helpers below. (See class + // comment for details.) + FilterDescriptor(); + + ~FilterDescriptor(); + + // Named-argument helpers for avoiding user error during construction. + FilterDescriptor& set_output_feature_map_count(int64 value) { + output_feature_map_count_ = value; + return *this; + } + FilterDescriptor& set_input_feature_map_count(int64 value) { + input_feature_map_count_ = value; + return *this; + } + FilterDescriptor& set_input_filter_height(int64 value) { + input_filter_height_ = value; + return *this; + } + FilterDescriptor& set_input_filter_width(int64 value) { + input_filter_width_ = value; + return *this; + } + FilterDescriptor& set_layout(FilterLayout layout) { + layout_ = layout; + return *this; + } + + void CloneFrom(const FilterDescriptor& other); + + string ToString() const; + string ToShortString() const; + + // Returns the number of weights required as parameters for a convolution + // using this filter descriptor. + int64 ComputeWeightCount() const; + + // Returns the number of biases required as parameters for a convolution using + // this filter descriptor. 
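+  // (For example, with illustrative numbers: a 5x5 filter applied to 3 input
+  // feature maps to produce 64 output feature maps uses
+  // 64 * 3 * 5 * 5 == 4800 weights and 64 biases, one bias per output
+  // feature map.)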
+  int64 bias_count() const { return output_feature_map_count_; }
+
+  int64 output_feature_map_count() const { return output_feature_map_count_; }
+  int64 input_feature_map_count() const { return input_feature_map_count_; }
+  int64 input_filter_height() const { return input_filter_height_; }
+  int64 input_filter_width() const { return input_filter_width_; }
+  FilterLayout layout() const { return layout_; }
+
+ private:
+  int64 output_feature_map_count_;
+  int64 input_feature_map_count_;
+  int64 input_filter_height_;
+  int64 input_filter_width_;
+  FilterLayout layout_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(FilterDescriptor);
+};
+
+// Describes a convolution.
+//
+// Uses the named argument construction form:
+//
+//  ConvolutionDescriptor convolution_dimensions;
+//  convolution_dimensions
+//      .set_vertical_filter_stride(2)
+//      .set_horizontal_filter_stride(2)
+//      ...
+//
+// Arguments:
+// - zero_padding_height: padding of the "y dimension" of the input data. Note
+//   that this is different from the height of the filter.
+// - zero_padding_width: analogous to the height above, but in the "x
+//   dimension".
+// - vertical_filter_stride: the convolution slides a 2-dimensional window of
+//   filter-height-by-filter-width over the input layer -- the center of that
+//   window is moved in the "y dimension" according to this stride value.
+// - horizontal_filter_stride: analogous to the vertical stride above, but in
+//   the "x dimension".
+class ConvolutionDescriptor {
+ public:
+  // By default construction, there is no zero-padding and the filter stride is
+  // 1x1 (centering the filter on every cell in the input layer's
+  // width-by-height area).
+  ConvolutionDescriptor();
+  ~ConvolutionDescriptor();
+
+  string ToString() const;
+  string ToShortString() const;
+
+  ConvolutionDescriptor& set_zero_padding_height(int64 value) {
+    zero_padding_height_ = value;
+    return *this;
+  }
+  ConvolutionDescriptor& set_zero_padding_width(int64 value) {
+    zero_padding_width_ = value;
+    return *this;
+  }
+  ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
+    vertical_filter_stride_ = value;
+    return *this;
+  }
+  ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
+    horizontal_filter_stride_ = value;
+    return *this;
+  }
+
+  int64 zero_padding_height() const { return zero_padding_height_; }
+  int64 zero_padding_width() const { return zero_padding_width_; }
+  int64 vertical_filter_stride() const { return vertical_filter_stride_; }
+  int64 horizontal_filter_stride() const { return horizontal_filter_stride_; }
+
+ private:
+  int64 zero_padding_height_;
+  int64 zero_padding_width_;
+  int64 vertical_filter_stride_;
+  int64 horizontal_filter_stride_;
+  // TODO(leary) cudnn provides these fields, but need to characterize what
+  // their effect is -- they may be boolean rather than integral.
+  // int64 upscale_input_x;
+  // int64 upscale_input_y;
+};
+
+// A patch of values in the input can be pooled via either a max or an average
+// operation.
+// Specify int64 so there's no padding in PoolingDescriptor.
+enum class PoolingMode : int64 {
+  kMaximum,
+  kAverage,
+};
+
+// Describes a pooling operation to be enqueued onto a stream via a platform's
+// DnnSupport.
+//
+// TODO(broune): describe how padding works and what happens if the
+// window height/width is not divisible by the vertical/horizontal
+// stride.
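+//
+// As a rule of thumb (assuming the common cuDNN-style formula; the
+// not-evenly-divisible cases are exactly what the TODO above leaves
+// unspecified), the output extent along each dimension is
+//   floor((input + 2 * padding - window) / stride) + 1;
+// e.g. a 224-wide input with window 3, horizontal padding 1 and stride 2
+// yields floor((224 + 2 - 3) / 2) + 1 = 112 output columns.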
+//
+// Arguments:
+//  pooling_mode: pooling operator to use on the input patch
+//  window_height: height of input window
+//  window_width: width of input window
+//  vertical_padding: zero-padding applied to the "y dimension" of the input
+//  horizontal_padding: zero-padding applied to the "x dimension" of the input
+//  vertical_stride: vertical delta for center of the input patch
+//  horizontal_stride: horizontal delta for center of the input patch
+class PoolingDescriptor {
+ public:
+  PoolingDescriptor();
+
+  PoolingDescriptor& set_pooling_mode(PoolingMode value) {
+    mode_ = value;
+    return *this;
+  }
+  PoolingDescriptor& set_window_height(int64 value) {
+    window_height_ = value;
+    return *this;
+  }
+  PoolingDescriptor& set_window_width(int64 value) {
+    window_width_ = value;
+    return *this;
+  }
+  PoolingDescriptor& set_vertical_padding(int64 value) {
+    vertical_padding_ = value;
+    return *this;
+  }
+  PoolingDescriptor& set_horizontal_padding(int64 value) {
+    horizontal_padding_ = value;
+    return *this;
+  }
+  PoolingDescriptor& set_vertical_stride(int64 value) {
+    vertical_stride_ = value;
+    return *this;
+  }
+  PoolingDescriptor& set_horizontal_stride(int64 value) {
+    horizontal_stride_ = value;
+    return *this;
+  }
+
+  void CloneFrom(const PoolingDescriptor& other);
+
+  string ToString() const;
+  string ToShortString() const;
+
+  PoolingMode mode() const { return mode_; }
+  int64 window_height() const { return window_height_; }
+  int64 window_width() const { return window_width_; }
+  int64 vertical_padding() const { return vertical_padding_; }
+  int64 horizontal_padding() const { return horizontal_padding_; }
+  int64 vertical_stride() const { return vertical_stride_; }
+  int64 horizontal_stride() const { return horizontal_stride_; }
+
+ private:
+  PoolingMode mode_;
+  int64 window_height_;
+  int64 window_width_;
+  int64 vertical_padding_;
+  int64 horizontal_padding_;
+  int64 vertical_stride_;
+  int64 horizontal_stride_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(PoolingDescriptor);
+};
+
+// Describes a dist_belief local response normalization.
+// The normalization equation is:
+//  y_i = x_i / (bias + alpha * (sum_j_{i - range}^{i + range} x_j^2)) ^ beta
+// where x_i is the input in feature map i, y_i is the output.
+// Each feature map is split into segment_size segments for performing the
+// sum_j_. If wrap_around is true, the sum_j_ for y_i on the left and right of
+// a segment wraps around at the edges of the segment; if wrap_around is
+// false, zeros are inserted instead.
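+// For illustration (AlexNet-style parameter values, chosen here only as an
+// example): with bias = 1.0, alpha = 1e-4, beta = 0.75 and range = 2,
+//   y_i = x_i / (1.0 + 1e-4 * (x_{i-2}^2 + x_{i-1}^2 + ... + x_{i+2}^2))^0.75
+// so an activation is damped when its neighboring feature maps are large.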
+class NormalizeDescriptor { + public: + NormalizeDescriptor(); + + NormalizeDescriptor& set_bias(float bias) { + bias_ = bias; + return *this; + } + + NormalizeDescriptor& set_range(int32 range) { + range_ = range; + return *this; + } + + NormalizeDescriptor& set_alpha(float alpha) { + alpha_ = alpha; + return *this; + } + + NormalizeDescriptor& set_beta(float beta) { + beta_ = beta; + return *this; + } + + NormalizeDescriptor& set_wrap_around(bool wrap_around) { + wrap_around_ = wrap_around; + return *this; + } + + NormalizeDescriptor& set_segment_size(int32 segment_size) { + segment_size_ = segment_size; + return *this; + } + + void CloneFrom(const NormalizeDescriptor& other); + + string ToString() const; + string ToShortString() const; + + float bias() const { return bias_; } + int32 range() const { return range_; } + float alpha() const { return alpha_; } + float beta() const { return beta_; } + bool wrap_around() const { return wrap_around_; } + int32 segment_size() const { return segment_size_; } + + private: + float bias_; + int32 range_; + float alpha_; + float beta_; + bool wrap_around_; + int32 segment_size_; + + SE_DISALLOW_COPY_AND_ASSIGN(NormalizeDescriptor); +}; + +// Describes a kind of non-linearity (threshold-like mathematical function). +enum class ActivationMode { + kSigmoid, + // Rectified linear activation: f(x) = x < 0 ? 0 : x + kRelu, + // Rectified linear activation, where upper maximum is 6.0. + kRelu6, + // Rectified linear activation, where upper maximum specified by + // BatchDescriptor::value_max(). + kReluX, + kTanh, +}; + +// Returns a string representation of the given activation mode. +string ActivationModeString(ActivationMode mode); + +// Describes the operation that DoElementwiseOperation should perform on its +// inputs. +enum class ElementwiseOperation { + kAdd, + kMultiply +}; + +string ElementwiseOperationString(ElementwiseOperation op); + +// Suite of operations typically used for implementing Deep/Convolutional Neural +// Nets. +class DnnSupport { + public: + DnnSupport() {} + virtual ~DnnSupport() {} + + virtual port::Status Init() = 0; + + // Enqueues a single-precision convolution operation onto the stream. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'convolve' operation + // should be enqueued onto. + // input_descriptor: dimensions of the input layer. + // input_data: un-owned device memory region which contains the + // convolution input. + // filter_descriptor: dimensions of the convolution filter. + // weights: coefficients for the convolution filter, these are multiplied + // against values in the input that the filter convolves over. + // convolution_descriptor: stride of the convolution filter. + // output_descriptor: dimensions of the output layer. + // output_data: un-owned device memory region in which to place the + // convolution result. + // + // input_descriptor, filter_descriptor, convolution_descriptor and + // output_descriptor together specify exactly how the convolution is aligned + // with the input data: + // + // * (input dimensions - filter size + 1) / filter stride == output dimensions + // corresponds to dist_belief padding = VALID, i.e. the input is not padded. + // * input dimensions / filter stride == output dimensions + // corresponds to dist_belief padding = SAME, i.e. input and output are the + // same size - this requires padding the input. 
+  // * (input dimensions + filter size - 1) / filter stride == output
+  //   dimensions corresponds to dist_belief padding = FULL, i.e. the output
+  //   is sized so that if the inverse of the filter is applied to the output
+  //   in VALID mode the result is the same size as the input - this requires
+  //   even more padding of the input.
+  virtual bool DoConvolve(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data) = 0;
+
+  // Enqueues a double-precision convolution operation onto the stream.
+  // See DoConvolve above for argument details.
+  virtual bool DoConvolve(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<double>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double>* output_data) = 0;
+
+  // Variation of the above with the weight matrix split into two matrices.
+  // first_weights: Coefficients of the first matrix.
+  // second_weights: Coefficients of the second matrix.
+  // depth_multiplier: specifies the columns of the first matrix and rows
+  // of the second one - first_weights columns = depth_multiplier,
+  // second_weights rows = depth_multiplier *
+  //                       filter_descriptor.input_feature_map_count().
+  // see go/separable for documentation on separable convolutions.
+  virtual bool DoSeparableConvolve(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const FilterDescriptor& filter_descriptor, int depth_multiplier,
+      const DeviceMemory<float>& first_weights,
+      const DeviceMemory<float>& second_weights,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data) = 0;
+
+  // Enqueues a single-precision backward convolution (for data) operation onto
+  // the stream.
+  //
+  // Arguments:
+  //  stream: borrowed pointer to the stream that the 'convolve' operation
+  //    should be enqueued onto.
+  //  filter_descriptor: dimensions of the convolution filter.
+  //  filter_data: coefficients for the convolution filter.
+  //  output_descriptor: dimensions of the output gradients, which is the same
+  //    as the dimensions of the output.
+  //  backward_output_data: un-owned device memory region which contains the
+  //    backprop of the output.
+  //  convolution_descriptor: stride of the convolution filter.
+  //  input_descriptor: dimensions of the input layer.
+  //  backward_input_data: un-owned device memory region in which to place the
+  //    backprop of the input.
+  virtual bool DoConvolveBackwardData(
+      Stream* stream, const FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const BatchDescriptor& input_descriptor,
+      DeviceMemory<float>* backward_input_data) = 0;
+
+  // Enqueues a single-precision backward convolution (for filter) operation
+  // onto the stream.
+  //
+  // Arguments:
+  //  stream: borrowed pointer to the stream that the 'convolve' operation
+  //    should be enqueued onto.
+  //  input_descriptor: dimensions of the input layer.
+  //  input_data: un-owned device memory region which contains the
+  //    convolution input.
+  //  output_descriptor: dimensions of the output gradients, which is the same
+  //    as the dimensions of the output.
+  //  backward_output_data: un-owned device memory region which contains the
+  //    backprop of the output.
+  //  convolution_descriptor: stride of the convolution filter.
+  //  filter_descriptor: dimensions of the convolution filter.
+  //  backward_filter_data: un-owned device memory region in which to place the
+  //    backprop of the filter.
+  virtual bool DoConvolveBackwardFilter(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<float> backward_output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const FilterDescriptor& filter_descriptor,
+      DeviceMemory<float>* backward_filter_data) = 0;
+
+  // Fully connects the "nodes" (float values) in input_data with
+  // shape input_dimensions to output_data with output_dimensions
+  // using provided weights. This is equivalent to computing a matrix
+  // product, hence the name MatMul.
+  //
+  // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products
+  // happen in two dimensions. To get down to two dimensions, we consider the
+  // input y, x and depth dimension as one combined dimension T. For now,
+  // assume that the output height and width are 1 and let OD be the output
+  // depth.
+  //
+  // There are three device memory buffers passed in to this
+  // function. We can now view all three as matrices:
+  //
+  //   input_data: A batch x T matrix
+  //   weights: A T x OD matrix
+  //   output_data: A batch x OD matrix
+  //
+  // This function then computes the matrix product of input_data and
+  // weights and writes the result into output_data.
+  //
+  // Here the weights buffer is in row major order, i.e. the first OD
+  // entries in weights are the first row, the second OD entries in
+  // weights are the second row and so on.
+  //
+  // The case for output width*height > 1 is more complicated. Let K =
+  // OY * OX where OY is the output height and OX is the output
+  // width. Then weights is divided into K sub-arrays W_i, for
+  // i=0,...,K-1, that each represent a T x OD matrix. This function
+  // then computes the K matrix multiplications of input_data with
+  // each W_i. This creates K matrices with dimensions batch x
+  // OD. These K matrices are concatenated horizontally to form one
+  // larger matrix with dimensions batch x (K*OD); note that this is
+  // not the same as concatenating the bytes of the matrices. The
+  // combined matrix can then be interpreted as a tensor with
+  // dimensions (batch, OY, OX, OD). If the output tensor format is
+  // not kBatchYXDepth, this function would then need to arrange for
+  // the output to be in the requested layout, if that is
+  // supported. Note that the case K=1 is equivalent to the
+  // description above. It is recommended to prefer the case K=1.
+  //
+  // Arguments (all borrowed):
+  //  stream: borrowed pointer to the stream that the 'fully connect' operation
+  //    should be enqueued onto.
+  //  output_data: un-owned device memory region in which to place the
+  //    fully connected result.
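+  // (Shape sketch, with illustrative numbers only: an input with count = 32
+  // and T = 1 * 1 * 4096, multiplied by a row-major 4096 x 1000 weights
+  // buffer, yields a 32 x 1000 output, i.e. output_dimensions with
+  // count = 32 and feature_map_count = 1000.)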
+  virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
+                        const DeviceMemory<float>& weights,
+                        const dnn::BatchDescriptor& input_dimensions,
+                        const dnn::BatchDescriptor& output_dimensions,
+                        DeviceMemory<float>* output_data) = 0;
+
+  // Version of DoMatMul that uses pre-quantized 8 bit weights.
+  // weight_scales specifies the scaling of each column of weights:
+  // original float weight[row * num_columns + column] =
+  //     quantized_weight[row * num_columns + column] * weight_scales[column].
+  virtual bool DoMatMulQuantized(Stream* stream,
+                                 const DeviceMemory<float>& input_data,
+                                 const DeviceMemory<int8>& quantized_weights,
+                                 const DeviceMemory<float>& weight_scales,
+                                 const dnn::BatchDescriptor& input_dimensions,
+                                 const dnn::BatchDescriptor& output_dimensions,
+                                 DeviceMemory<float>* output_data) = 0;
+
+  // Version of DoMatMul that uses pre-quantized 16 bit weights.
+  // weight_scales specifies the scaling of each column of weights:
+  // original float weight[row * num_columns + column] =
+  //     quantized_weight[row * num_columns + column] * weight_scales[column].
+  virtual bool DoMatMulQuantized(Stream* stream,
+                                 const DeviceMemory<float>& input_data,
+                                 const DeviceMemory<int16>& quantized_weights,
+                                 const DeviceMemory<float>& weight_scales,
+                                 const dnn::BatchDescriptor& input_dimensions,
+                                 const dnn::BatchDescriptor& output_dimensions,
+                                 DeviceMemory<float>* output_data) = 0;
+
+  // Adds biases to the feature maps in input_data producing
+  // output_data. input_data can equal output_data, but must not
+  // partially overlap it.
+  //
+  // Let K = count() * height() * width() and N = feature_map_count()
+  // on dimensions. Then input_data contains K*N values and biases
+  // contains N values. We can thus logically consider input_data to
+  // contain K vectors of N elements each. This function adds biases
+  // to each of those N vectors.
+  //
+  // TODO(broune): This works differently when width() * height() > 1
+  // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In
+  // that case there should be width() * height() *
+  // feature_map_count() biases, but this is not implemented on all
+  // StreamExecutors.
+  //
+  // Arguments (all borrowed):
+  //  stream: borrowed pointer to the stream that the 'bias add' operation
+  //    should be enqueued onto.
+  //  input_data: un-owned device memory region containing the input.
+  //  biases: un-owned device memory region containing biases to add to the
+  //    input.
+  //  dimensions: dimensions of input_data and output_data.
+  //  output_data: un-owned device memory region in which to place the result.
+  virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data,
+                         const DeviceMemory<float>& biases,
+                         const dnn::BatchDescriptor& dimensions,
+                         DeviceMemory<float>* output_data) = 0;
+
+  // Performs a forward pooling operation on input_data, writing to
+  // output_data. See PoolingDescriptor for how to configure the
+  // pooling operation.
+  //
+  // Pooling happens as a window that moves across the Y and X
+  // dimensions of input_data, where each position of the window
+  // yields one output value. E.g. for max pooling, the computed value
+  // is the maximum element in the window. The operation is applied
+  // independently to each batch and at each feature map (depth), so
+  // that the output depth and feature_map_count are the same as for
+  // the input. The output width and height can be different.
+  //
+  // See PoolingDescriptor for how to configure the pooling operation.
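+  // (For example, with illustrative numbers: max-pooling a (count=10, y=224,
+  // x=224, depth=64) input with a 2x2 window and 2x2 stride yields a
+  // (count=10, y=112, x=112, depth=64) output; count and depth are
+  // unchanged.)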
+ virtual bool DoPoolForward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) = 0; + + // Performs differentiation of the pooling operation. + virtual bool DoPoolBackward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) = 0; + + // Applies local response normalization to all of the values + // held on the device in 'input_data'. + virtual bool DoNormalize(Stream* stream, + const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) = 0; + + // Applies an activation function (see ActivationMode) to all of the values + // held on the device in 'input_data', whose dimensions are described by + // 'dimensions'. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'activate' operation + // should be enqueued onto. + // activation_mode: Type of activation to perform. + // input_data: un-owned device memory region which contains the + // activate input. + // output_data: un-owned device memory region in which to place the + // activate result. + virtual bool DoActivate(Stream* stream, ActivationMode activation_mode, + const BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) = 0; + + // Concatenates several layers into one, by concatenating the depth of each + // layer at matching x and y coordinates. + // The inputs must all have the same width and height, the output will have + // the same width and height as the inputs and its depth will be the sum of + // the input depths. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'depth concatenate' + // operation should be enqueued onto. + // input_dimensions: The dimensions of each input. + // input_data: un-owned device memory region which contains the + // input data for each input layer. + // output_data: un-owned device memory region in which to place the + // depth concatenate result. + virtual bool DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) = 0; + + // Computes the specified operation (e.g. addition or multiplication) + // between corresponding elements in the inputs and stores the result in the + // output element. + // The inputs and output must all have the same dimensions, but may have + // different quantization parameters (min_value and max_value). + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'elementwise operation' + // should be enqueued onto. + // operation: The operation to perform. + // input_dimensions: The dimensions of each input. + // input_data: un-owned device memory region which contains the + // input data for each input layer. + // output_dimensions: The dimensions of the output. + // output_data: un-owned device memory region in which to place the + // operation result. 
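+  // (Range sketch: because each input carries its own (value_min, value_max)
+  // quantization range, adding a [0.0, 6.0] Relu6 output to a [0.0, 1.0]
+  // sigmoid output produces values in [0.0, 7.0], so output_dimensions
+  // should declare a range at least that wide.)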
+  virtual bool DoElementwiseOperate(
+      Stream* stream, ElementwiseOperation operation,
+      port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+      port::ArraySlice<const DeviceMemory<float>*> input_data,
+      const dnn::BatchDescriptor& output_dimensions,
+      DeviceMemory<float>* output_data) = 0;
+
+  // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that
+  // is, bytes instead of scaled floats) into 'host_dst' if they are available
+  // for the underlying DNN implementation. If this quantized output is not
+  // available, false is returned, which will place 'stream' into an error
+  // state.
+  //
+  // Arguments (all borrowed):
+  //  stream: borrowed pointer to the stream that the 'quantized memcpy'
+  //    operation should be enqueued onto.
+  //  gpu_unquantized_src: the device memory that contains the unquantized data
+  //    -- this data should also have a corresponding quantized representation
+  //    on the device for this operation to succeed.
+  //  host_dst: un-owned host memory region that is mutated in place; it is
+  //    clobbered by the values in 'gpu_unquantized_src' when the enqueued
+  //    (asynchronous) memcpy operation is performed.
+  // TODO(wgulland) Merge all these versions of DoMemcpyD2HQuantized.
+  virtual bool DoMemcpyD2HQuantized(
+      Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+      port::MutableArraySlice<uint8> host_dst) = 0;
+
+  // As above, but for 16-bit values.
+  virtual bool DoMemcpyD2HQuantized(
+      Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+      port::MutableArraySlice<uint16> host_dst) = 0;
+
+  // As above, but for signed 32-bit values.
+  virtual bool DoMemcpyD2HQuantized(
+      Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+      port::MutableArraySlice<int32> host_dst) = 0;
+
+  // Enqueues an asynchronous memcpy of 'host_src' into the *quantized* input
+  // of a layer (that is, bytes instead of scaled floats) if they are supported
+  // by the underlying DNN implementation. If this quantized input is not
+  // supported, false is returned, which will place 'stream' into an error
+  // state.
+  //
+  // Arguments (all borrowed):
+  //  stream: borrowed pointer to the stream that the 'quantized memcpy'
+  //    operation should be enqueued onto.
+  //  host_src: un-owned host memory region that contains the quantized data.
+  //  gpu_unquantized_dst: the device memory that is clobbered by the values in
+  //    'host_src' when the enqueued (asynchronous) memcpy operation is
+  //    performed; this data should also have a corresponding quantized
+  //    representation on the device for this operation to succeed.
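+  // (Illustrative mapping only; the exact quantization scheme is
+  // implementation-defined: with value_min = 0.0 and value_max = 6.0, an
+  // 8-bit quantized activation q might correspond to the float value
+  // value_min + q * (value_max - value_min) / 255.)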
+  virtual bool DoMemcpyH2DQuantized(
+      Stream* stream, port::ArraySlice<uint8> host_src,
+      DeviceMemory<float>* gpu_unquantized_dst) = 0;
+
+ private:
+  SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
+};
+
+}  // namespace dnn
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
new file mode 100644
index 0000000000..4ac14ea30b
--- /dev/null
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -0,0 +1,208 @@
+#include "tensorflow/stream_executor/dso_loader.h"
+
+#include <dlfcn.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <initializer_list>
+#include <vector>
+
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace internal {
+
+/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
+  return GetDsoHandle(FindDsoPath("libcublas.so.7.0",
+                                  "third_party/gpus/cuda/lib64"),
+                      dso_handle);
+}
+
+/* static */ port::Status DsoLoader::GetCudnnDsoHandle(void** dso_handle) {
+  // libcudnn is versioned differently than the other libraries. See b/22397368
+  // for some details about the complications surrounding this.
+  return GetDsoHandle(FindDsoPath("libcudnn.so.6.5",
+                                  "third_party/gpus/cuda/lib64"),
+                      dso_handle);
+}
+
+/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
+  return GetDsoHandle(FindDsoPath("libcufft.so.7.0",
+                                  "third_party/gpus/cuda/lib64"),
+                      dso_handle);
+}
+
+/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
+  return GetDsoHandle(FindDsoPath("libcurand.so.7.0",
+                                  "third_party/gpus/cuda/lib64"),
+                      dso_handle);
+}
+
+/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
+  return GetDsoHandle(FindDsoPath("libcuda.so",
+                                  "third_party/gpus/cuda/driver/lib64"),
+                      dso_handle);
+}
+
+/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
+  return GetDsoHandle(
+      FindDsoPath("libcupti.so.7.0",
+                  "third_party/gpus/cuda/extras/CUPTI/lib64"),
+      dso_handle);
+}
+
+/* static */ void DsoLoader::RegisterRpath(port::StringPiece path) {
+  mutex_lock lock{rpath_mutex_};
+  GetRpaths()->push_back(path.ToString());
+}
+
+/* static */ port::Status DsoLoader::GetDsoHandle(port::StringPiece path,
+                                                  void** dso_handle,
+                                                  LoadKind load_kind) {
+  int dynload_flags =
+      RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
+  string path_string = path.ToString();
+  *dso_handle = dlopen(path_string.c_str(), dynload_flags);
+  if (*dso_handle == nullptr) {
+    LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH");
+    // TODO(b/22689637): Eliminate unnecessary ToString once StrCat has been
+    // moved to the open-sourceable version.
+    return port::Status(
+        port::error::FAILED_PRECONDITION,
+        port::StrCat("could not dlopen DSO: ", path, "; dlerror: ", dlerror()));
+  }
+
+  VLOG(2) << "loaded path \"" << path << "\" "
+          << (load_kind == LoadKind::kLocal ? "locally" : "globally");
+  return port::Status::OK();
+}
+
+/* static */ string DsoLoader::GetBinaryDirectory(bool strip_executable_name) {
+  char exe_path[PATH_MAX] = {0};
+  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  // Make sure it's null-terminated:
+  exe_path[sizeof(exe_path) - 1] = 0;
+
+  if (strip_executable_name) {
+    // The exe is the last component of the path, so remove one component.
+    std::vector<string> components = port::Split(exe_path, '/');
+    components.pop_back();
+    return port::Join(components, "/");
+  }
+  return exe_path;
+}
+
+// Creates a heap-allocated vector for initial rpaths.
+// Ownership is transferred to the caller.
+static std::vector<string>* CreatePrimordialRpaths() {
+  auto rpaths = new std::vector<string>;
+  rpaths->push_back(
+      "driver/driver_sh.runfiles/third_party/gpus/cuda/lib64");
+  return rpaths;
+}
+
+/* static */ mutex DsoLoader::rpath_mutex_{LINKER_INITIALIZED};
+/* static */ std::vector<string>* DsoLoader::GetRpaths() {
+  static std::vector<string>* rpaths = CreatePrimordialRpaths();
+  return rpaths;
+}
+
+/* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) {
+  char buf[PATH_MAX];
+  char* result = realpath(candidate->c_str(), buf);
+  if (result == nullptr) {
+    return false;
+  }
+  VLOG(3) << "realpath resolved candidate path \"" << *candidate << "\" to \""
+          << result << "\"";
+  *candidate = result;
+  return true;
+}
+
+/* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name,
+                                           port::StringPiece runfiles_relpath) {
+  // Keep a record of the paths we attempted so we can dump out meaningful
+  // diagnostics if no path is found.
+  std::vector<string> attempted;
+
+  using StringPieces = std::vector<port::StringPiece>;
+  string candidate;
+
+  // Try binary-plus-rpath locations.
+  string binary_directory =
+      GetBinaryDirectory(true /* = strip_executable_name */);
+  mutex_lock lock{rpath_mutex_};
+  for (const string& rpath : *GetRpaths()) {
+    candidate =
+        port::Join(StringPieces{binary_directory, rpath, library_name}, "/");
+    if (TrySymbolicDereference(&candidate)) {
+      return candidate;
+    }
+    // Record the miss so every searched location can be reported below.
+    attempted.push_back(candidate);
+  }
+
+  VLOG(1) << "unable to find DSO via attempted paths: "
+          << port::Join(attempted, ", ") << "; falling back to bare name";
+  return library_name.ToString();
+}
+
+// -- CachedDsoLoader
+
+/* static */ port::StatusOr<void*> CachedDsoLoader::GetCublasDsoHandle() {
+  static port::StatusOr<void*> result =
+      FetchHandleResult(DsoLoader::GetCublasDsoHandle);
+  return result;
+}
+
+/* static */ port::StatusOr<void*> CachedDsoLoader::GetCurandDsoHandle() {
+  static port::StatusOr<void*> result =
+      FetchHandleResult(DsoLoader::GetCurandDsoHandle);
+  return result;
+}
+
+/* static */ port::StatusOr<void*> CachedDsoLoader::GetCudnnDsoHandle() {
+  static port::StatusOr<void*> result =
+      FetchHandleResult(DsoLoader::GetCudnnDsoHandle);
+  return result;
+}
+
+/* static */ port::StatusOr<void*> CachedDsoLoader::GetCufftDsoHandle() {
+  static port::StatusOr<void*> result =
+      FetchHandleResult(DsoLoader::GetCufftDsoHandle);
+  return result;
+}
+
+/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcudaDsoHandle() {
+  static port::StatusOr<void*> result =
+      FetchHandleResult(DsoLoader::GetLibcudaDsoHandle);
+  return result;
+}
+
+/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcuptiDsoHandle() {
+  static port::StatusOr<void*> result =
+      FetchHandleResult(DsoLoader::GetLibcuptiDsoHandle);
+  return result;
+}
+
+/* static */ port::StatusOr<void*> CachedDsoLoader::FetchHandleResult(
+    std::function<port::Status(void**)> load_dso) {
+  void* handle;
+  auto status = load_dso(&handle);
+  if (!status.ok()) {
+    return status;
+  }
+  return handle;
+}
+
+}  // namespace internal
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
new file mode 100644
index 0000000000..4dcc48d231
--- /dev/null
+++ b/tensorflow/stream_executor/dso_loader.h
@@ -0,0 +1,107 @@
+// Common DSO loading functionality: exposes callables that dlopen DSOs found
+// via binary-relative (runfiles) locations or on the system loader path.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
+
+#include <vector>
+
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace internal {
+
+// Permits StreamExecutor code to dynamically load a pre-determined set of
+// relevant DSOs via dlopen.
+//
+// Thread-safe.
+class DsoLoader {
+ public:
+  // The following methods either load the DSO of interest and return a dlopen
+  // handle or error status in the canonical namespace.
+
+  static port::Status GetCublasDsoHandle(void** dso_handle);
+  static port::Status GetCudnnDsoHandle(void** dso_handle);
+  static port::Status GetCufftDsoHandle(void** dso_handle);
+  static port::Status GetCurandDsoHandle(void** dso_handle);
+  static port::Status GetLibcudaDsoHandle(void** dso_handle);
+  static port::Status GetLibcuptiDsoHandle(void** dso_handle);
+
+  // Registers a new binary-relative path to use as a dlopen search path.
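+  // For example (path illustrative only):
+  //
+  //   DsoLoader::RegisterRpath("third_party/gpus/cuda/lib64");
+  //
+  // should be called before the first Get*DsoHandle() lookup that is expected
+  // to search that path.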
+ static void RegisterRpath(port::StringPiece path); + + private: + // Registered rpaths (singleton vector) and a mutex that guards it. + static std::vector<string>* GetRpaths(); + static mutex rpath_mutex_; + + // Descriptive boolean wrapper to indicate whether symbols are made available + // to resolve in later-loaded libraries. + enum class LoadKind { kLocal, kGlobal }; + + // Loads a DSO from the given "path" (which can technically be any dlopen-able + // name). If the load kind is global, the symbols in the loaded DSO are + // visible to subsequent DSO loading operations. + static port::Status GetDsoHandle(port::StringPiece path, void** dso_handle, + LoadKind load_kind = LoadKind::kLocal); + + + // Returns the binary directory (or binary path) associated with the currently + // executing program. If strip_executable_name is true, the executable file is + // stripped off of the path. + static string GetBinaryDirectory(bool strip_executable_name); + + // Returns the location of the runfiles directory. + // * Manual invocation gets the runfiles as a relative path to the current + // executable. + static string GetRunfilesDirectory(); + + // Invokes realpath on the original path; updates candidate and returns true + // if it succeeds (i.e. a file exists at the path); otherwise, returns false. + static bool TrySymbolicDereference(string* candidate); + + // Attempts to find a path to the DSO of interest, otherwise returns the + // bare library name: + // Arguments: + // library_name: the filename in tree; e.g. libOpenCL.so.1.0.0 + // runfiles_relpath: where to look for the library relative to the runfiles + // root; e.g. third_party/gpus/cuda/lib64 + static string FindDsoPath(port::StringPiece library_name, + port::StringPiece runfiles_relpath); + + SE_DISALLOW_COPY_AND_ASSIGN(DsoLoader); +}; + +// Wrapper around the DsoLoader that prevents us from dlopen'ing any of the DSOs +// more than once. +class CachedDsoLoader { + public: + // Cached versions of the corresponding DsoLoader methods above. + static port::StatusOr<void*> GetCublasDsoHandle(); + static port::StatusOr<void*> GetCudnnDsoHandle(); + static port::StatusOr<void*> GetCufftDsoHandle(); + static port::StatusOr<void*> GetCurandDsoHandle(); + static port::StatusOr<void*> GetLibcudaDsoHandle(); + static port::StatusOr<void*> GetLibcuptiDsoHandle(); + + private: + // Fetches a DSO handle via "load_dso" and returns the StatusOr form of the + // result. 
+ static port::StatusOr<void*> FetchHandleResult( + std::function<port::Status(void**)> load_dso); + + SE_DISALLOW_COPY_AND_ASSIGN(CachedDsoLoader); +}; + +} // namespace internal +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_ diff --git a/tensorflow/stream_executor/event.cc b/tensorflow/stream_executor/event.cc new file mode 100644 index 0000000000..79c3d39f24 --- /dev/null +++ b/tensorflow/stream_executor/event.cc @@ -0,0 +1,48 @@ +#include "tensorflow/stream_executor/event.h" + +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/stream.h" + +namespace perftools { +namespace gputools { + +internal::EventInterface* CreateEventImplementation( + StreamExecutor* stream_exec) { + PlatformKind platform_kind = stream_exec->platform_kind(); + switch (platform_kind) { + case PlatformKind::kCuda: + return (*internal::MakeCUDAEventImplementation())(stream_exec); + default: + LOG(FATAL) << "Cannot create event implementation for platform kind: " + << PlatformKindString(platform_kind); + } +} + +Event::Event(StreamExecutor* stream_exec) + : implementation_(CreateEventImplementation(stream_exec)), + stream_exec_(stream_exec) {} + +Event::~Event() { + auto status = stream_exec_->DeallocateEvent(this); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + } +} + +bool Event::Init() { + auto status = stream_exec_->AllocateEvent(this); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + return false; + } + + return true; +} + +Event::Status Event::PollForStatus() { + return stream_exec_->PollForEventStatus(this); +} + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/event.h b/tensorflow/stream_executor/event.h new file mode 100644 index 0000000000..fdd5112d9a --- /dev/null +++ b/tensorflow/stream_executor/event.h @@ -0,0 +1,63 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_EVENT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_EVENT_H_ + +#include <memory> + +namespace perftools { +namespace gputools { + +namespace internal { +class EventInterface; +} + +class Stream; +class StreamExecutor; + +// The Event class, when supported by a platform, enables low-overhead status +// reporting for a Stream. An Event is inserted at a location in a stream via +// the Stream::ThenRecordEvent() API. From then on, the Event's status can be +// monitored via the nonblocking Event::PollForStatus() call. +class Event { + public: + // Potential states for an Event. If PollForStatus() returns anything aside + // from kPending or kComplete, an error has occurred; kUnknown is a bad state. + // Not all implementations are able to return all enumeration values. Refer to + // the platform-specific implementation for details. + enum class Status { + kUnknown, + kError, + kPending, + kComplete, + }; + + explicit Event(StreamExecutor* stream_exec); // NOLINT + + // Releases any resources held by the Event object. + ~Event(); + + // Performs any platform-specific or potentially error-generating + // initialization. + bool Init(); + + // Returns the current Status for the event. + Status PollForStatus(); + + // Returns a pointer to the underlying platform-specific implementation. + internal::EventInterface* implementation() { return implementation_.get(); } + + private: + friend class Stream; + + // Pointer to the platform-specific EventInterface implementation underlying + // the object. Owned. 
+ std::unique_ptr<internal::EventInterface> implementation_; + + // Pointer to the StreamExecutor interface used to create this object. + // Not owned. + StreamExecutor* stream_exec_; +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_EVENT_H_ diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc new file mode 100644 index 0000000000..7bf1a9aa4a --- /dev/null +++ b/tensorflow/stream_executor/executor_cache.cc @@ -0,0 +1,43 @@ +#include "tensorflow/stream_executor/executor_cache.h" + +#include "tensorflow/stream_executor/lib/stringprintf.h" + +namespace perftools { +namespace gputools { + +port::Status ExecutorCache::Insert(const StreamExecutorConfig& config, + std::unique_ptr<StreamExecutor> entry) { + if (Get(config).ok()) { + return port::Status(port::error::ALREADY_EXISTS, + "An executor with a matching config already exists."); + } + + cache_[config.ordinal].emplace_back(Entry(config, std::move(entry))); + + return port::Status::OK(); +} + +port::StatusOr<StreamExecutor*> ExecutorCache::Get( + const StreamExecutorConfig& config) { + auto entries = cache_.find(config.ordinal); + if (entries == cache_.end()) { + return port::Status( + port::error::NOT_FOUND, + port::Printf("No executors registered for ordinal %d", config.ordinal)); + } + + for (const auto& iter : entries->second) { + if (iter.first.plugin_config == config.plugin_config && + iter.first.device_options == config.device_options) { + return iter.second.get(); + } + } + + return port::Status(port::error::NOT_FOUND, + "No executor found with a matching config."); +} + +void ExecutorCache::DestroyAllExecutors() { cache_.clear(); } + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/executor_cache.h b/tensorflow/stream_executor/executor_cache.h new file mode 100644 index 0000000000..4d1d9ddb07 --- /dev/null +++ b/tensorflow/stream_executor/executor_cache.h @@ -0,0 +1,45 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_CACHE_H_ +#define TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_CACHE_H_ + +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" + +namespace perftools { +namespace gputools { + +// Utility class to allow Platform objects to manage cached StreamExecutors. +class ExecutorCache { + public: + ExecutorCache() {} + + // Inserts a new StreamExecutor with the given configuration into the cache. + // Will not overwrite if called when a matching element is already present. + port::Status Insert(const StreamExecutorConfig& config, + std::unique_ptr<StreamExecutor> executor); + + // Returns a pointer to the described executor (if one with a matching config + // has been created), or a NOT_FOUND status. + port::StatusOr<StreamExecutor*> Get(const StreamExecutorConfig& config); + + // Destroys all Executors and clears the cache. + // Performs no synchronization - undefined behavior may occur if any executors + // are active! + void DestroyAllExecutors(); + + private: + typedef std::pair<StreamExecutorConfig, std::unique_ptr<StreamExecutor>> + Entry; + + // Maps ordinal number to a list of cached executors for that ordinal. + // We key off of ordinal (instead of just looking up all fields in the + // StreamExecutorConfig) for a slight improvement in lookup time. 
+  std::map<int, std::vector<Entry>> cache_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
+};
+
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_CACHE_H_
diff --git a/tensorflow/stream_executor/fft.h b/tensorflow/stream_executor/fft.h
new file mode 100644
index 0000000000..b47921d8f2
--- /dev/null
+++ b/tensorflow/stream_executor/fft.h
@@ -0,0 +1,187 @@
+// Exposes the family of FFT routines as pre-canned high performance calls for
+// use in conjunction with the StreamExecutor abstraction.
+//
+// Note that this interface is optionally supported by platforms; see
+// StreamExecutor::SupportsFft() for details.
+//
+// This abstraction makes it simple to entrain FFT operations on GPU data into
+// a Stream -- users typically will not use this API directly, but will use the
+// Stream builder methods to entrain these operations "under the hood". For
+// example:
+//
+//  DeviceMemory<std::complex<float>> x =
+//    stream_exec->AllocateArray<std::complex<float>>(1024);
+//  DeviceMemory<std::complex<float>> y =
+//    stream_exec->AllocateArray<std::complex<float>>(1024);
+//  // ... populate x and y ...
+//  Stream stream{stream_exec};
+//  std::unique_ptr<Plan> plan = stream_exec->AsFft()->Create1dPlan(
+//      &stream, 1024, Type::kC2CForward, false /* in_place_fft */);
+//  stream
+//    .Init()
+//    .ThenFft(plan.get(), x, &y)
+//    .BlockHostUntilDone();
+//
+// By using stream operations in this manner the user can easily intermix custom
+// kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned FFT
+// routines.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_FFT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_FFT_H_
+
+#include <complex>
+#include <memory>
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+template <typename ElemT>
+class DeviceMemory;
+
+namespace fft {
+
+// Specifies FFT input and output types, and the direction.
+// R, D, C, and Z stand for single-precision real, double-precision real,
+// single-precision complex, and double-precision complex, respectively.
+enum class Type {
+  kC2CForward,
+  kC2CInverse,
+  kC2R,
+  kR2C,
+  kZ2ZForward,
+  kZ2ZInverse,
+  kZ2D,
+  kD2Z
+};
+
+// FFT plan class. Each FFT implementation should define a plan class that is
+// derived from this class. It does not provide any interface but serves
+// as a common type that is used to execute the plan.
+class Plan {
+ public:
+  virtual ~Plan() {}
+};
+
+// FFT support interface -- this can be derived from a GPU executor when the
+// underlying platform has an FFT library implementation available. See
+// StreamExecutor::AsFft().
+//
+// This support interface is not generally thread-safe: only the CUDA platform
+// (cuFFT) implementation is thread-safe; host-side FFT support is
+// thread-compatible, but not thread-safe.
+class FftSupport {
+ public:
+  virtual ~FftSupport() {}
+
+  // Creates a 1d FFT plan.
+  virtual std::unique_ptr<Plan> Create1dPlan(Stream *stream, uint64 num_x,
+                                             Type type, bool in_place_fft) = 0;
+
+  // Creates a 2d FFT plan.
+  virtual std::unique_ptr<Plan> Create2dPlan(Stream *stream, uint64 num_x,
+                                             uint64 num_y, Type type,
+                                             bool in_place_fft) = 0;
+
+  // Creates a 3d FFT plan.
+  virtual std::unique_ptr<Plan> Create3dPlan(Stream *stream, uint64 num_x,
+                                             uint64 num_y, uint64 num_z,
+                                             Type type, bool in_place_fft) = 0;
+
+  // Creates a batched FFT plan.
+  //
+  // stream: The GPU stream in which the FFT runs.
+  // rank: Dimensionality of the transform (1, 2, or 3).
+  // elem_count: Array of size rank, describing the size of each dimension.
+  // input_embed, output_embed:
+  //   Pointers to arrays of size rank that indicate the storage dimensions
+  //   of the input/output data in memory. If set to nullptr, all other
+  //   advanced data layout parameters are ignored.
+  // input_stride: Indicates the distance (number of elements; same below)
+  //   between two successive input elements.
+  // input_distance: Indicates the distance between the first element of two
+  //   consecutive signals in a batch of the input data.
+  // output_stride: Indicates the distance between two successive output
+  //   elements.
+  // output_distance: Indicates the distance between the first element of two
+  //   consecutive signals in a batch of the output data.
+  virtual std::unique_ptr<Plan> CreateBatchedPlan(
+      Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+      uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+      uint64 output_stride, uint64 output_distance, Type type,
+      bool in_place_fft, int batch_count) = 0;
+
+  // Computes complex-to-complex FFT in the transform direction as specified
+  // by the direction parameter.
+  virtual bool DoFft(Stream *stream, Plan *plan,
+                     const DeviceMemory<std::complex<float>> &input,
+                     DeviceMemory<std::complex<float>> *output) = 0;
+  virtual bool DoFft(Stream *stream, Plan *plan,
+                     const DeviceMemory<std::complex<double>> &input,
+                     DeviceMemory<std::complex<double>> *output) = 0;
+
+  // Computes real-to-complex FFT in forward direction.
+  virtual bool DoFft(Stream *stream, Plan *plan,
+                     const DeviceMemory<float> &input,
+                     DeviceMemory<std::complex<float>> *output) = 0;
+  virtual bool DoFft(Stream *stream, Plan *plan,
+                     const DeviceMemory<double> &input,
+                     DeviceMemory<std::complex<double>> *output) = 0;
+
+  // Computes complex-to-real FFT in inverse direction.
+  virtual bool DoFft(Stream *stream, Plan *plan,
+                     const DeviceMemory<std::complex<float>> &input,
+                     DeviceMemory<float> *output) = 0;
+  virtual bool DoFft(Stream *stream, Plan *plan,
+                     const DeviceMemory<std::complex<double>> &input,
+                     DeviceMemory<double> *output) = 0;
+
+ protected:
+  FftSupport() {}
+
+ private:
+  SE_DISALLOW_COPY_AND_ASSIGN(FftSupport);
+};
+
+// Macro used to quickly declare overrides for abstract virtuals in the
+// fft::FftSupport base class. Assumes that it's emitted somewhere inside the
+// ::perftools::gputools namespace.
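+// A typical use, sketched here with a hypothetical implementation class name:
+//
+//   class CUDAFft : public fft::FftSupport {
+//    public:
+//     TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
+//     ...
+//   };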
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES \ + std::unique_ptr<fft::Plan> Create1dPlan(Stream *stream, uint64 num_x, \ + fft::Type type, bool in_place_fft) \ + override; \ + std::unique_ptr<fft::Plan> Create2dPlan(Stream *stream, uint64 num_x, \ + uint64 num_y, fft::Type type, \ + bool in_place_fft) override; \ + std::unique_ptr<fft::Plan> Create3dPlan( \ + Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z, \ + fft::Type type, bool in_place_fft) override; \ + std::unique_ptr<fft::Plan> CreateBatchedPlan( \ + Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed, \ + uint64 input_stride, uint64 input_distance, uint64 *output_embed, \ + uint64 output_stride, uint64 output_distance, fft::Type type, \ + bool in_place_fft, int batch_count) override; \ + bool DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<float>> &input, \ + DeviceMemory<std::complex<float>> *output) override; \ + bool DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<double>> &input, \ + DeviceMemory<std::complex<double>> *output) override; \ + bool DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<float> &input, \ + DeviceMemory<std::complex<float>> *output) override; \ + bool DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<double> &input, \ + DeviceMemory<std::complex<double>> *output) override; \ + bool DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<float>> &input, \ + DeviceMemory<float> *output) override; \ + bool DoFft(Stream *stream, fft::Plan *plan, \ + const DeviceMemory<std::complex<double>> &input, \ + DeviceMemory<double> *output) override; + +} // namespace fft +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_FFT_H_ diff --git a/tensorflow/stream_executor/gcuda.cc b/tensorflow/stream_executor/gcuda.cc new file mode 100644 index 0000000000..505534c08f --- /dev/null +++ b/tensorflow/stream_executor/gcuda.cc @@ -0,0 +1,87 @@ +#include "tensorflow/stream_executor/gcuda.h" + +namespace perftools { +namespace gputools { + +// Returns the mapping of gcudacc kernel stub to preferred cache +// configuration. C++ static singleton pattern. 
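+// (A function-local static is used so the map is constructed on first use;
+// this sidesteps static-initialization-order issues across translation
+// units.)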
+std::map<void *, KernelCacheConfig> &GetGcudaccStubToCacheConfigMap() {
+  static std::map<void *, KernelCacheConfig> cache_config_by_stub;
+  return cache_config_by_stub;
+}
+
+shared_mem_config::SharedMemConfig DeviceGetSharedMemConfig(
+    StreamExecutor *stream_exec) {
+  SharedMemoryConfig config = stream_exec->GetDeviceSharedMemoryConfig();
+
+  switch (config) {
+    case SharedMemoryConfig::kDefault:
+      return shared_mem_config::kDefaultBankSize;
+    case SharedMemoryConfig::kFourByte:
+      return shared_mem_config::kFourByteBankSize;
+    case SharedMemoryConfig::kEightByte:
+      return shared_mem_config::kEightByteBankSize;
+    default:
+      LOG(FATAL) << "Impossible shared memory config returned: "
+                 << static_cast<int>(config);
+  }
+}
+
+void DeviceSetSharedMemConfig(StreamExecutor *stream_exec,
+                              shared_mem_config::SharedMemConfig config) {
+  SharedMemoryConfig executor_config;
+  switch (config) {
+    case shared_mem_config::kDefaultBankSize:
+      executor_config = SharedMemoryConfig::kDefault;
+      break;
+    case shared_mem_config::kFourByteBankSize:
+      executor_config = SharedMemoryConfig::kFourByte;
+      break;
+    case shared_mem_config::kEightByteBankSize:
+      executor_config = SharedMemoryConfig::kEightByte;
+      break;
+    default:
+      LOG(FATAL) << "Impossible shared memory config specified: "
+                 << static_cast<int>(config);
+  }
+
+  if (!stream_exec->SetDeviceSharedMemoryConfig(executor_config).ok()) {
+    // The message is logged at a higher level.
+    LOG(INFO) << "Unable to set shared memory configuration; proceeding.";
+  }
+}
+
+template <>
+void FuncSetCacheConfig<void *>(Stream *stream, void *fptr,
+                                cache_config::CacheConfig cache_config) {
+  // Map from the legacy to the C++11 type.
+  KernelCacheConfig kernel_cache_config;
+  switch (cache_config) {
+    case cache_config::kPreferShared:
+      kernel_cache_config = KernelCacheConfig::kPreferShared;
+      break;
+    case cache_config::kPreferL1:
+      kernel_cache_config = KernelCacheConfig::kPreferL1;
+      break;
+    case cache_config::kPreferEqual:
+      kernel_cache_config = KernelCacheConfig::kPreferEqual;
+      break;
+    default:
+      kernel_cache_config = KernelCacheConfig::kNoPreference;
+  }
+  // Take the singleton map by reference so the insertion persists.
+  auto &cache_config_map = GetGcudaccStubToCacheConfigMap();
+  cache_config_map[fptr] = kernel_cache_config;
+}
+
+template <>
+KernelCacheConfig FuncGetCacheConfig<void *>(void *fptr) {
+  auto &cache_config_map = GetGcudaccStubToCacheConfigMap();
+  auto iter = cache_config_map.find(fptr);
+  if (iter == cache_config_map.end()) {
+    return KernelCacheConfig::kNoPreference;
+  }
+  return iter->second;
+}
+
+}  // namespace gputools
+}  // namespace perftools
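A minimal usage sketch for the gcudacc-mode shims above, assuming a live
StreamExecutor *stream_exec, a Stream *stream, and a hypothetical
gcudacc-generated stub saxpy_stub (none of these names are defined in this
change):

  // Prefer an eight-byte shared memory bank size on the current device.
  DeviceSetSharedMemConfig(stream_exec, shared_mem_config::kEightByteBankSize);

  // Record an L1 preference for the stub, then read it back. Stubs that were
  // never configured report KernelCacheConfig::kNoPreference.
  FuncSetCacheConfig(stream, &saxpy_stub, cache_config::kPreferL1);
  KernelCacheConfig config = FuncGetCacheConfig(&saxpy_stub);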
diff --git a/tensorflow/stream_executor/gcuda.h b/tensorflow/stream_executor/gcuda.h
new file mode 100644
index 0000000000..24b09c5358
--- /dev/null
+++ b/tensorflow/stream_executor/gcuda.h
@@ -0,0 +1,415 @@
+// Common declarations and includes for mixed-mode GPU usage at Google.
+//
+// This header serves to define a "common baseline" for GPU usage,
+// either with gcudacc or nvcc, and on the host or device. The rule of thumb is,
+// "if you're working with mixed-mode GPU code at Google, include this header."
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_
+
+// Symbol glossary:
+//   __CUDACC__: CUDA capable compiler, compiling host or device
+//   __CUDA_ARCH__: Compiling device code
+//   __GCUDACC__: Using gcudacc
+//   __NVCC__: Using nvcc
+
+// For device code compiled with gcudacc, CUDA_ASSUME(X) tells the compiler
+// that it may assume that X is true. This can enable further optimization.
+// It is undefined behavior if X is not true. X should not have side-effects
+// and gcudacc will try to warn you if it does.
+#if defined(__CUDA_ARCH__) && defined(__GCUDACC__)
+#define CUDA_ASSUME(X) __builtin_assume(X)
+#else
+#define CUDA_ASSUME(X) do {} while (false)
+#endif
+
+namespace perftools {
+namespace gputools {
+namespace cache_config {
+// A version of the KernelCacheConfig enum class, exposed for pre-C++11
+// compilers.
+enum CacheConfig {
+  // Indicates no preference for device L1/shared memory configuration.
+  kNoPreference,
+
+  // Indicates a preference for more shared memory than L1 cache.
+  kPreferShared,
+
+  // Indicates a preference for more L1 cache than shared memory.
+  kPreferL1,
+
+  // Indicates a preference for equal amounts of L1 cache and shared memory.
+  kPreferEqual,
+};
+}  // namespace cache_config
+
+namespace shared_mem_config {
+// A compatibility-layer declaration of CUsharedconfig, needed to support
+// cuFuncSetSharedMemConfig/cudaDeviceSetSharedMemConfig. Declared here for
+// compatibility with pre-C++11 compilers.
+enum SharedMemConfig {
+  // Indicates that the context's shared memory config should be used.
+  kDefaultBankSize,
+
+  // Specifies a four-byte bank size for shared memory.
+  kFourByteBankSize,
+
+  // Specifies an eight-byte bank size for shared memory.
+  kEightByteBankSize,
+};
+}  // namespace shared_mem_config
+}  // namespace gputools
+}  // namespace perftools
+
+#if !defined(__NVCC__) && !defined(GCUDACC_STANDALONE_MODE)
+// Using gcudacc, either device-only or mixed-mode code. No special declarations
+// are needed for host-only code being compiled under gcudacc.
+
+// These includes are required by the code introduced during gcudacc operation.
+// Since the user code may not directly include these headers, they may not be
+// present in the build environment without inclusion here.
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/machine_manager.h"
+#include "tensorflow/stream_executor/shared_memory_config.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+// cudaConfigureCall is a symbol used by Clang when it sees a CUDA triple-angle-
+// bracket launch, so we declare it here so the symbol resolves. It is not used
+// by gcudacc-generated code, however, so it is not defined anywhere.
+// In other words, this is a dummy declaration needed for parsing.
+
+#ifdef __GCUDACC__
+// These symbols only need to be defined during compilation with gcudacc.
+namespace perftools {
+namespace gputools {
+
+// This class defines all the implicit conversions necessary to match launch
+// dimensions against the cudaConfigureCall() signature, and sits where a dim3
+// usually would in triple angle launches. This supports the kernel launch
+// dimension styles:
+//   kernel<<<1, 1>>>() and
+//   kernel<<<BlockDim(...), ThreadDim(...)>>> and
+//   kernel<<<dim3(1), dim3(1)>>>
+// All of these are predicated upon implicit conversions, which are frowned upon
+// by the style guide. Rather than add this CUDA-specific bad behavior to
+// StreamExecutor headers, we isolate it here.
+class LaunchDimConverter {
+ public:
+  LaunchDimConverter(unsigned long long int i) : _dim(i, 1, 1) {}  // NOLINT
+  LaunchDimConverter(::perftools::gputools::BlockDim dim)
+      :  // NOLINT
+        _dim(dim.x, dim.y, dim.z) {}
+  LaunchDimConverter(::perftools::gputools::ThreadDim dim)
+      :  // NOLINT
+        _dim(dim.x, dim.y, dim.z) {}
+  LaunchDimConverter(dim3 dim) :  // NOLINT
+      _dim(dim.x, dim.y, dim.z) {}
+
+  ::perftools::gputools::BlockDim AsBlockDim() {
+    return ::perftools::gputools::BlockDim(_dim.x, _dim.y, _dim.z);
+  }
+
+  ::perftools::gputools::ThreadDim AsThreadDim() {
+    return ::perftools::gputools::ThreadDim(_dim.x, _dim.y, _dim.z);
+  }
+
+ private:
+  ::perftools::gputools::Dim3D _dim;
+};
+}  // namespace gputools
+}  // namespace perftools
+
+int cudaConfigureCall(::perftools::gputools::LaunchDimConverter grid_size,
+                      ::perftools::gputools::LaunchDimConverter block_size,
+                      unsigned shared_size = 0,
+                      ::perftools::gputools::Stream *stream = 0);
+#endif
+
+// The rest of the symbols in this block are needed during both StreamExecutor
+// and user library compilation.
+namespace perftools {
+namespace gputools {
+
+// Gets the preferred shared memory configuration for the device to which
+// the specified executor is bound.
+shared_mem_config::SharedMemConfig DeviceGetSharedMemConfig(
+    StreamExecutor *stream_exec);
+
+// Sets the preferred shared memory configuration for the device to which
+// the specified executor is bound.
+// Does not return an error if the current device is invalid.
+void DeviceSetSharedMemConfig(StreamExecutor *stream_exec,
+                              shared_mem_config::SharedMemConfig config);
+
+// Sets the preferred cache configuration for the given kernel.
+template <typename KernelT>
+void FuncSetCacheConfig(Stream *stream, KernelT kernel,
+                        cache_config::CacheConfig cache_config) {
+  FuncSetCacheConfig(stream, reinterpret_cast<void *>(kernel), cache_config);
+}
+
+// Internal specialization of the above.
+template <>
+void FuncSetCacheConfig<void *>(Stream *stream, void *kernel,
+                                cache_config::CacheConfig cache_config);
+
+// Gets the preferred cache configuration for the given kernel.
+template <typename KernelT>
+KernelCacheConfig FuncGetCacheConfig(KernelT kernel) {
+  return FuncGetCacheConfig(reinterpret_cast<void *>(kernel));
+}
+
+// Internal specialization of the above.
+template <>
+KernelCacheConfig FuncGetCacheConfig<void *>(void *kernel);
+
+}  // namespace gputools
+}  // namespace perftools
+
+#elif defined(__NVCC__)
+// NVCC code compilation, device-only or mixed mode. As above, no special
+// declarations are needed for host-only code.
+namespace perftools {
+namespace gputools {
+class Stream;
+}  // namespace gputools
+}  // namespace perftools
+
+// --- BEGIN EXTERNALLY-DEFINED FUNCTIONS
+
+// The following functions must be defined in some external library linked into
+// the final binary - they are _not_ defined in the StreamExecutor
+// (in nvcc mode).
+
+// Sets the preferred cache configuration for the specified kernel.
+template <typename KernelT>
+void SetCudaCacheConfig(perftools::gputools::Stream* stream, KernelT kernel,
+    ::perftools::gputools::cache_config::CacheConfig preference);
+
+// Gets the current device for use in CUDA runtime-emulating routines.
+// Returns the device ordinal, as given by
+// StreamExecutor::device_ordinal().
+int GetDevice();
+
+// Sets the current device for use in CUDA runtime-emulating routines.
+// "device" is the device ordinal as returned by
+// StreamExecutor::device_ordinal().
+void SetDevice(int device);
+
+// --- END EXTERNALLY-DEFINED FUNCTIONS
+
+namespace perftools {
+namespace gputools {
+template <typename KernelT>
+void FuncSetCacheConfig(Stream *stream, KernelT kernel,
+                        cache_config::CacheConfig cache_config) {
+  SetCudaCacheConfig(stream, reinterpret_cast<void*>(kernel), cache_config);
+}
+}  // namespace gputools
+}  // namespace perftools
+
+// The following functions are declared extern "C" in CUDA's device_functions.h,
+// so we have to wrap them for compatibility with the cuda_builtin namespace.
+// Thin wrappers to break these functions out of cuda_builtin are defined below.
+__forceinline__ __device__ clock_t __gcuda_nvcc_clock() { return clock(); }
+__forceinline__ __device__ int __gcuda_nvcc__clz(int x) {
+  return __clz(x);
+}
+__forceinline__ __device__ int __gcuda_nvcc__clzll(long long int x) {
+  return __clzll(x);
+}
+__forceinline__ __device__ float __gcuda_nvcc__fdividef(float a, float b) {
+  return __fdividef(a, b);
+}
+__forceinline__ __device__ int __gcuda_nvcc__ffsll(long long int x) {  // NOLINT
+  return __ffsll(x);
+}
+__forceinline__ __device__ int __gcuda_nvcc__popc(unsigned int x) {
+  return __popc(x);
+}
+__forceinline__ __device__ float __gcuda_nvcc__powf(float a, float b) {
+  return __powf(a, b);
+}
+__forceinline__ __device__ void __gcuda_nvcc__sincosf(
+    float x, float *sptr, float *cptr) {
+  __sincosf(x, sptr, cptr);
+}
+__forceinline__ __device__ unsigned int __gcuda_nvcc__umulhi(
+    unsigned int x, unsigned int y) {
+  return __umulhi(x, y);
+}
+
+#if __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
+__forceinline__ __device__ unsigned int __gcuda_nvcc__ballot(int x) {
+  return __ballot(x);
+}
+#endif  // __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
+
+// Forward-declare printf as nvcc does not declare it by itself and we
+// need this file to compile even if it is included before including
+// stdio.h or cstdio.
+int printf(const char* format, ...);
+
+namespace cuda_builtin {
+using ::abs;
+using ::atomicAdd;
+using ::atomicCAS;
+using ::ceil;
+using ::ceilf;
+using ::cos;
+using ::cosf;
+using ::erfcinv;
+using ::erfcinvf;
+using ::exp;
+using ::expf;
+using ::fabs;
+using ::fabsf;
+using ::floor;
+using ::floorf;
+using ::fma;
+using ::fmaf;
+using ::fmax;
+using ::fmaxf;
+using ::fmin;
+using ::fminf;
+using ::log;
+using ::log1p;
+using ::log1pf;
+using ::logf;
+using ::max;
+using ::min;
+using ::powf;
+using ::printf;
+using ::sin;
+using ::sinf;
+using ::sincos;
+using ::sincosf;
+using ::sincospi;
+using ::sincospif;
+using ::sqrt;
+using ::sqrtf;
+using ::tanh;
+using ::trunc;
+using ::truncf;
+
+// rsqrt and rsqrtf are functions defined by nvcc in both host and device mode.
+// Add these functions to gcuda.h so they are likewise available in both modes.
+// On the device side they correspond to intrinsics, while explicit definitions
+// are provided below for the host side.
+#ifdef __CUDA_ARCH__
+using ::rsqrt;
+using ::rsqrtf;
+#else
+__forceinline__ __host__ __device__ float rsqrtf(float x) {
+  return 1 / std::sqrt(x);
+}
+__forceinline__ __host__ __device__ double rsqrt(double x) {
+  return 1 / std::sqrt(x);
+}
+#endif
+
+__forceinline__ __device__ clock_t clock() { return __gcuda_nvcc_clock(); }
+
+__forceinline__ __device__ int __clz(int x) {
+  return __gcuda_nvcc__clz(x);
+}
+
+__forceinline__ __device__ int __clz(long long int x) {
+  return __gcuda_nvcc__clzll(x);
+}
+
+__forceinline__ __device__ float __fdividef(float a, float b) {
+  return __gcuda_nvcc__fdividef(a, b);
+}
+
+__forceinline__ __device__ int __ffsll(long long int x) {  // NOLINT
+  return __gcuda_nvcc__ffsll(x);
+}
+
+__forceinline__ __device__ int __popc(unsigned int x) {
+  return __gcuda_nvcc__popc(x);
+}
+
+__forceinline__ __device__ float __powf(float a, float b) {
+  return __gcuda_nvcc__powf(a, b);
+}
+
+__forceinline__ __device__ void __sincosf(float x, float *sptr, float *cptr) {
+  __gcuda_nvcc__sincosf(x, sptr, cptr);
+}
+
+__forceinline__ __device__ unsigned int __umulhi(unsigned int x,
+                                                 unsigned int y) {
+  return __gcuda_nvcc__umulhi(x, y);
+}
+
+#ifdef __CUDA_ARCH__
+// These symbols are only visible when parsing device code.
+using ::__double_as_longlong;
+using ::__int_as_float;
+using ::__float_as_int;
+using ::__longlong_as_double;
+#endif  // __CUDA_ARCH__
+
+#if __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
+__forceinline__ __device__ unsigned int __ballot(int x) {
+  return __gcuda_nvcc__ballot(x);
+}
+#endif  // __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
+
+#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
+using ::__shfl;
+using ::__shfl_down;
+using ::__shfl_up;
+using ::__shfl_xor;
+#endif  // __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
+
+#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)
+using ::__ldg;
+#endif  // __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)
+
+#if __CUDA_API_VERSION < 6050
+// CUDA < 6.5 defines isfinite as a macro, while CUDA >= 6.5 and gcudacc
+// define isfinite as a function. Work around this for the pre-6.5 case,
+// duplicating that macro definition.
+#undef isfinite
+#define __gcuda_nvcc_isfinite(x)                                    \
+  (sizeof(x) == sizeof(float) ? __finitef(x) :                      \
+   sizeof(x) == sizeof(double) ? __finite(x) : __finitel(x))
+inline __device__ int isfinite(float x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+inline __device__ int isfinite(double x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+inline __device__ int isfinite(long double x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+#else
+// CUDA API >= v6.5
+using ::isfinite;
+#endif  // __CUDA_API_VERSION < 6050
+}  // namespace cuda_builtin
+
+#if __CUDA_API_VERSION >= 6050
+// The second part of the isfinite workaround.
+inline __device__ int isfinite(float x) { + return __gcuda_nvcc_isfinite(x); +} +inline __device__ int isfinite(double x) { + return __gcuda_nvcc_isfinite(x); +} +inline __device__ int isfinite(long double x) { + return __gcuda_nvcc_isfinite(x); +} +#endif // __CUDA_API_VERSION >= 6050 + +#endif // defined(__NVCC__) + +#endif // TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_ diff --git a/tensorflow/stream_executor/gpu_launch_dim.h b/tensorflow/stream_executor/gpu_launch_dim.h new file mode 100644 index 0000000000..51182b2d32 --- /dev/null +++ b/tensorflow/stream_executor/gpu_launch_dim.h @@ -0,0 +1,8 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_LAUNCH_DIM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_GPU_LAUNCH_DIM_H_ + +// TODO(rspringer): Temporary redirection until all users - including gcudacc - +// are using the new file. +#include "tensorflow/stream_executor/launch_dim.h" + +#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_LAUNCH_DIM_H_ diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc new file mode 100644 index 0000000000..5e7fe95627 --- /dev/null +++ b/tensorflow/stream_executor/kernel.cc @@ -0,0 +1,95 @@ +// Implementation of the pointer-to-implementation wrapper for the data-parallel +// kernel abstraction. KernelBase just delegates to the internal +// platform-specific implementation instance. + +#include "tensorflow/stream_executor/kernel.h" + +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/lib/demangle.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace perftools { +namespace gputools { + +bool KernelMetadata::registers_per_thread(int *registers_per_thread) const { + if (has_registers_per_thread_) { + *registers_per_thread = registers_per_thread_; + return true; + } + + return false; +} + +void KernelMetadata::set_registers_per_thread(int registers_per_thread) { + registers_per_thread_ = registers_per_thread; + has_registers_per_thread_ = true; +} + +bool KernelMetadata::shared_memory_bytes(int *shared_memory_bytes) const { + if (has_shared_memory_bytes_) { + *shared_memory_bytes = shared_memory_bytes_; + return true; + } + + return false; +} + +void KernelMetadata::set_shared_memory_bytes(int shared_memory_bytes) { + shared_memory_bytes_ = shared_memory_bytes; + has_shared_memory_bytes_ = true; +} + +static internal::KernelInterface *KernelImplementationFromPlatformKind( + PlatformKind platform_kind) { + if (platform_kind == PlatformKind::kCuda) { + return (*internal::MakeCUDAKernelImplementation())(); + } else if (platform_kind == PlatformKind::kOpenCL || + platform_kind == PlatformKind::kOpenCLAltera) { + return (*internal::MakeOpenCLKernelImplementation())(); + } else { + LOG(FATAL) << "cannot create kernel implementation for platform kind: " + << PlatformKindString(platform_kind); + } +} + +KernelBase::KernelBase(StreamExecutor *parent) + : implementation_( + KernelImplementationFromPlatformKind(parent->platform_kind())), + parent_(parent) { + DCHECK(parent_ != nullptr); +} + +KernelBase::KernelBase(StreamExecutor *parent, + internal::KernelInterface *implementation) + : implementation_(implementation), parent_(parent) {} + +KernelBase::~KernelBase() {} + +unsigned KernelBase::Arity() const { return implementation_->Arity(); } + +void KernelBase::SetPreferredCacheConfig(KernelCacheConfig config) { + return 
implementation_->SetPreferredCacheConfig(config); +} + +KernelCacheConfig KernelBase::GetPreferredCacheConfig() const { + return implementation_->GetPreferredCacheConfig(); +} + +// Prefix stub functions emitted by the CUDA splitter. +static const char *kStubPrefix = "__device_stub_"; + +void KernelBase::set_name(port::StringPiece name) { + name_ = name.ToString(); + port::StringPiece stubless_name = name; + if (name.starts_with(kStubPrefix)) { + stubless_name.remove_prefix(strlen(kStubPrefix)); + } + demangled_name_ = port::Demangle(stubless_name.data()); +} + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h new file mode 100644 index 0000000000..da646d0f40 --- /dev/null +++ b/tensorflow/stream_executor/kernel.h @@ -0,0 +1,499 @@ +// Suite of datatypes to represent data-parallel kernel objects (code entities). +// Kernel is the untyped variant, whereas TypedKernel takes a type signature +// to do some template-based helper generation and give compile-time type +// checking for kernel launch parameters. +// +// Users typically don't see KernelBase, they see typed kernels, analogous to a +// typed function pointer. TypedKernels express their argument types via +// template parameters like so: +// +// TypedKernel<DeviceMemory<int>*, int> +// +// Which expresses a data parallel kernel signature for: +// +// void(int*, int); +// +// And for a const memory region: +// +// TypedKernel<const DeviceMemory<int>&, int> +// +// Corresponds to a data parallel kernel signature for: +// +// void(const int*, int) +// +// Note that kernels always have a void return type, so results typically must +// be memcpy'ied from device memory to the host. +// +// Also note that a scalar integer residing in device memory and an array of +// integers residing in device memory have the same signature: DeviceMemory<T>. +// However, in the future, checks may be added for additional safety that arrays +// of minimum sizes are passed when those minimum sizes are contractually +// expected by the kernel. +// +// For user-defined types whose definitions are appropriately shared between the +// host code doing the launching and the kernel code being launched, the user +// defined types are similarly permitted to be expressed as residing in device +// memory: +// +// TypedKernel<DeviceMemory<MyUserDefinedStructure>> +// +// And, when the alignment and padding are agreed upon, POD types will also be +// able to be passed by value; for example, it is a common idiom to specify a +// bunch of options simultaneously with a structure: +// +// TypedKernel<MyOptionsStructurePassedByValue, DeviceMemory<float>> +// +// Which corresponds to a data parallel kernel signature like: +// +// void(MyOptionsStructurePassedByValue value, float *result); +// +// Users typically won't need to type out the TypedKernel signature in full, it +// will be typedef'd by automatically generated code; for example, see +// perftools::gputools::executor_sample::VecReduceAddKernel. 
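+//
+// For illustration, a hypothetical typed kernel taking a device array and a
+// count might be declared and launched roughly as follows; the kernel name,
+// the launch dimensions, and the use of Stream::ThenLaunch are assumptions
+// for this sketch:
+//
+//  TypedKernel<DeviceMemory<float> *, int> kernel{stream_exec};
+//  // ... load the kernel via StreamExecutor::GetKernel() and a
+//  // MultiKernelLoaderSpec (see kernel_spec.h) ...
+//  DeviceMemory<float> data = stream_exec->AllocateArray<float>(1024);
+//  stream.ThenLaunch(ThreadDim(256), BlockDim(4), kernel, &data, 1024);
+//
+// Passing arguments whose types do not match the TypedKernel's parameter pack
+// is rejected at compile time by KernelInvocationChecker, defined below.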
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
+#define TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
+
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
+
+namespace perftools {
+namespace gputools {
+
+class DeviceMemoryBase;
+template <typename ElemT>
+class DeviceMemory;
+class StreamExecutor;
+
+namespace internal {
+class KernelInterface;
+}  // namespace internal
+
+// KernelMetadata holds runtime-queryable attributes of a loaded kernel, such as
+// registers allocated, shared memory used, etc.
+// Not all platforms support reporting of all information, so each accessor
+// returns false if the associated field is not populated in the underlying
+// platform.
+class KernelMetadata {
+ public:
+  KernelMetadata()
+      : has_registers_per_thread_(false), has_shared_memory_bytes_(false) {}
+
+  // Returns the number of registers used per thread executing this kernel.
+  bool registers_per_thread(int *registers_per_thread) const;
+
+  // Sets the number of registers used per thread executing this kernel.
+  void set_registers_per_thread(int registers_per_thread);
+
+  // Returns the amount of [static] shared memory used per block executing this
+  // kernel. Note that dynamic shared memory allocations cannot
+  // be reported here (since they're not specified until kernel launch time).
+  bool shared_memory_bytes(int *shared_memory_bytes) const;
+
+  // Sets the amount of [static] shared memory used per block executing this
+  // kernel.
+  void set_shared_memory_bytes(int shared_memory_bytes);
+
+ private:
+  // Holds the value returned by registers_per_thread above.
+  bool has_registers_per_thread_;
+  int registers_per_thread_;
+
+  // Holds the value returned by shared_memory_bytes above.
+  bool has_shared_memory_bytes_;
+  int shared_memory_bytes_;
+};
+
+// A data-parallel kernel (code entity) for launching via the StreamExecutor,
+// analogous to a void* device function pointer. See TypedKernel for the typed
+// variant.
+//
+// Thread-compatible.
+class KernelBase {
+ public:
+  // Constructs an "empty" (not-yet-loaded) kernel instance.
+  //
+  // parent is the StreamExecutor that will be responsible for loading the
+  // implementation of this kernel. It must not be null.
+  explicit KernelBase(StreamExecutor *parent);
+
+  // Test-only constructor that can take a mock KernelInterface implementation.
+  // Takes ownership of implementation; it must not be null.
+  KernelBase(StreamExecutor *parent, internal::KernelInterface *implementation);
+
+  // Releases resources associated with the kernel instance (i.e.
+  // platform-specific implementation).
+  ~KernelBase();
+
+  // Returns the number of parameters that this kernel accepts. (Arity refers to
+  // nullary, unary, ...).
+  unsigned Arity() const;
+
+  // Returns the StreamExecutor that represents the platform this kernel
+  // executes upon.
+  StreamExecutor *parent() const { return parent_; }
+
+  // Returns a const pointer to the (opaque) platform-dependent implementation.
+  const internal::KernelInterface *implementation() const {
+    return implementation_.get();
+  }
+
+  // Returns a non-const pointer to the (opaque) platform-dependent
+  // implementation.
+  internal::KernelInterface *implementation() { return implementation_.get(); }
+
+  void set_metadata(const KernelMetadata &metadata) { metadata_ = metadata; }
+
+  const KernelMetadata &metadata() const { return metadata_; }
+
+  // Sets the preferred cache configuration for a kernel. This is just a
+  // suggestion to the runtime, and may not be honored during execution.
+  void SetPreferredCacheConfig(KernelCacheConfig config);
+
+  // Gets the preferred cache configuration for a kernel.
+  KernelCacheConfig GetPreferredCacheConfig() const;
+
+  void set_name(port::StringPiece name);
+  const string &name() const { return name_; }
+  const string &demangled_name() const { return demangled_name_; }
+
+ private:
+  // Implementation delegated to for platform-specific functionality.
+  std::unique_ptr<internal::KernelInterface> implementation_;
+
+  // The StreamExecutor that loads this kernel object.
+  StreamExecutor *parent_;
+
+  string name_;
+  string demangled_name_;
+
+  KernelMetadata metadata_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(KernelBase);
+};
+
+// Whether T is a DeviceMemory-family pointer.
+template <typename T>
+struct IsDeviceMemoryPointer {
+  static constexpr bool value = false;
+};
+
+template <typename U>
+struct IsDeviceMemoryPointer<DeviceMemory<U> *> {
+  static constexpr bool value = true;
+};
+
+template <>
+struct IsDeviceMemoryPointer<DeviceMemoryBase *> {
+  static constexpr bool value = true;
+};
+
+// Whether T is a DeviceMemory-family value-like thing (which includes a
+// reference). This trait is useful because we pack values in the same manner as
+// references.
+template <typename T>
+struct IsDeviceMemoryValueLike {
+  static constexpr bool value = false;
+};
+
+template <typename U>
+struct IsDeviceMemoryValueLike<DeviceMemory<U> &> {
+  static constexpr bool value = true;
+};
+
+// We need to treat SharedDeviceMemory types differently than other DeviceMemory
+// types (since they maintain no allocations), hence these specializations.
+template <typename U>
+struct IsDeviceMemoryValueLike<SharedDeviceMemory<U> &> {
+  static constexpr bool value = false;
+};
+
+template <>
+struct IsDeviceMemoryValueLike<DeviceMemoryBase &> {
+  static constexpr bool value = true;
+};
+
+template <typename U>
+struct IsDeviceMemoryValueLike<DeviceMemory<U>> {
+  static constexpr bool value = true;
+};
+
+template <typename U>
+struct IsDeviceMemoryValueLike<SharedDeviceMemory<U>> {
+  static constexpr bool value = false;
+};
+
+template <>
+struct IsDeviceMemoryValueLike<DeviceMemoryBase> {
+  static constexpr bool value = true;
+};
+
+template <typename U>
+struct IsSharedDeviceMemory {
+  static constexpr bool value = false;
+};
+
+template <typename U>
+struct IsSharedDeviceMemory<SharedDeviceMemory<U> &> {
+  static constexpr bool value = true;
+};
+
+template <typename U>
+struct IsSharedDeviceMemory<SharedDeviceMemory<U>> {
+  static constexpr bool value = true;
+};
+
+// KernelArg encapsulates the information necessary for a back-end executor to
+// configure a kernel to launch using the given argument.
+struct KernelArg {
+  // Indicates the type of an argument: normal, to be passed to the kernel
+  // in the standard manner, or shared memory, which has distinct
+  // rules for specification per backend.
+  enum Type {
+    kNormal,
+    kSharedMemory,
+  } type;
+
+  // The data to pass to the kernel - either a pointer to device memory, or the
+  // argument value. An inlined vector is used to prevent smaller args (e.g. u8,
+  // u64) from requiring heap allocation.
+  port::InlinedVector<uint8, 4> data;
+
+  // The size of this argument in bytes.
+  uint64 bytes;
+};
+
+// Typed variant of KernelBase, like a typed device function pointer. See the
+// file comment for details and example usage.
+//
+// This class contains template metaprogramming magic to type check the
+// parameters passed to a kernel launch are acceptable, and subsequently pack
+// them into a form which can be used by the StreamExecutorInterface
+// implementation. (i.e. CUDA and OpenCL both bind void*s with associated
+// sizes as kernel arguments.)
+//
+// Thread-compatible.
+template <typename... Params>
+class TypedKernel : public KernelBase {
+ public:
+  // Delegates to KernelBase::KernelBase(), see that constructor.
+  explicit TypedKernel(StreamExecutor *parent) : KernelBase(parent) {}
+
+  // Test-only constructor that can take a mock KernelInterface implementation.
+  // Takes ownership of implementation; it must not be null.
+  TypedKernel(StreamExecutor *parent, internal::KernelInterface *implementation)
+      : KernelBase(parent, implementation) {}
+
+ private:
+  // Stream needs access to the specific parameter-packing functionality that
+  // the TypedKernel provides for its corresponding type signature (and no other
+  // type signatures).
+  friend class Stream;
+
+  // This is the main entry point into the magic. Packs the parameters (which
+  // must type check against the class template) into the args vector.
+  //
+  // Const refs are taken as parameters on all of the handlers to avoid
+  // implicit type promotion of integers.
+  void PackParams(std::vector<KernelArg> *args, Params... params) const {
+    PackOneParam(args, params...);
+  }
+
+  template <typename T, typename... RestOfParams>
+  void PackOneParam(std::vector<KernelArg> *args, const T &arg,
+                    const RestOfParams... rest) const {
+    PackOneParam(args, arg);
+    PackOneParam(args, rest...);
+  }
+
+  // Packs one (non-DeviceMemoryBase) parameter into the args vector.
+  // The enable_if<> is for excluding DeviceMemoryBase args, which have a
+  // separate implementation below.
+  template <typename T>
+  void PackOneParam(
+      std::vector<KernelArg> *args, const T &arg,
+      typename std::enable_if<!IsDeviceMemoryValueLike<T>::value &&
+                              !IsDeviceMemoryPointer<T>::value &&
+                              !IsSharedDeviceMemory<T>::value>::type * =
+          nullptr) const {
+    static_assert(!std::is_pointer<T>::value,
+                  "cannot pass raw pointer to the device");
+    static_assert(!std::is_convertible<T, DeviceMemoryBase>::value,
+                  "cannot pass device memory as a normal value");
+    const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
+    args->emplace_back(KernelArg{
+        KernelArg::kNormal,
+        port::InlinedVector<uint8, 4>{arg_ptr, arg_ptr + sizeof(arg)},
+        sizeof(arg)});
+  }
+
+  // DeviceMemoryBase family reference override.
+  template <typename T>
+  void PackOneParam(
+      std::vector<KernelArg> *args, const T &arg,
+      typename std::enable_if<IsDeviceMemoryValueLike<T>::value>::type * =
+          nullptr) const {
+    args->emplace_back(parent()->DeviceMemoryToKernelArg(arg));
+  }
+
+  // DeviceMemoryBase family pointer override.
+  template <typename T>
+  void PackOneParam(
+      std::vector<KernelArg> *args, T arg,
+      typename std::enable_if<IsDeviceMemoryPointer<T>::value>::type * =
+          nullptr) const {
+    DeviceMemoryBase *ptr = static_cast<DeviceMemoryBase *>(arg);
+    args->emplace_back(parent()->DeviceMemoryToKernelArg(*ptr));
+  }
+
+  // Dynamic shared device memory has a size, but no associated allocation on
+  // the host; internally, the device will allocate storage.
+  template <typename T>
+  void PackOneParam(
+      std::vector<KernelArg> *args, T arg,
+      typename std::enable_if<IsSharedDeviceMemory<T>::value>::type * =
+          nullptr) const {
+    args->emplace_back(KernelArg{KernelArg::kSharedMemory,
+                                 port::InlinedVector<uint8, 4>(), arg.size()});
+  }
+
+  // Base case for variadic template expansion - nothing to do!
+  void PackOneParam(std::vector<KernelArg> *args) const {}
+
+  SE_DISALLOW_COPY_AND_ASSIGN(TypedKernel);
+};
+
+// Template metaprogramming helper type that helps us produce better error
+// messages at compile time when there are mismatches between the parameter
+// type list and the argument type list.
+template <typename ParamTuple, typename ArgTuple>
+struct KernelInvocationChecker {
+  // Whether the parameter tuple and argument tuple match in length.
+  static constexpr bool kLengthMatches =
+      std::tuple_size<ParamTuple>::value == std::tuple_size<ArgTuple>::value;
+
+  // The (matching) length of the parameters and arguments type lists.
+  static constexpr int kTupleLength =
+      static_cast<int>(std::tuple_size<ArgTuple>::value);
+
+  // Helper trait to say whether the parameter wants a DeviceMemory-reference
+  // compatible type. This is for inexact type matches, so that it doesn't have
+  // to be precisely a const DeviceMemory<T>&, but can also be a value that
+  // represents the same.
+  template <typename ParamType, typename ArgType>
+  struct IsCompatibleDeviceMemoryRef {
+    static constexpr bool value = false;
+  };
+
+  // See type trait definition above.
+  template <typename U>
+  struct IsCompatibleDeviceMemoryRef<const DeviceMemory<U> &, DeviceMemory<U>> {
+    static constexpr bool value = true;
+  };
+
+  // See type trait definition above.
+  template <typename U>
+  struct IsCompatibleDeviceMemoryRef<const SharedDeviceMemory<U> &,
+                                     SharedDeviceMemory<U>> {
+    static constexpr bool value = true;
+  };
+
+  // Returns whether ParamT and ArgT are compatible for data parallel kernel
+  // parameter packing without any assert functionality.
+  template <typename ParamT, typename ArgT>
+  static constexpr bool CompatibleNoAssert() {
+    return std::is_same<typename std::remove_const<ParamT>::type,
+                        ArgT>::value ||
+           IsCompatibleDeviceMemoryRef<ParamT, ArgT>::value;
+  }
+
+  // Checks whether ParamT and ArgT are compatible for data parallel kernel
+  // parameter packing. kArgumentNumber is unused; it is just for error display.
+  //
+  // NOTE: if you encounter an error here, you can see the mismatch by looking
+  // at the end of the last error message, which will be of the form:
+  //
+  //    ...::Compatible<const perftools::gputools::DeviceMemory<OneThing> &,
+  //                    perftools::gputools::DeviceMemory<AnotherThing>, true,
+  //                    0>'
+  //    requested here
+  //
+  // This means that the 0th argument you passed to the kernel invocation should
+  // have been DeviceMemory<OneThing> but was observed to be
+  // DeviceMemory<AnotherThing>.
+  template <typename ParamT, typename ArgT, bool kShouldStaticAssert,
+            int kArgumentNumber>
+  static constexpr bool Compatible() {
+    static_assert(
+        kShouldStaticAssert ? CompatibleNoAssert<ParamT, ArgT>() : true,
+        "parameter type (LHS) is not compatible with argument type (RHS)");
+    return CompatibleNoAssert<ParamT, ArgT>();
+  }
+
+  // Checks the parameter/argument match at kArgumentNumber for an
+  // out-of-bounds (negative) argument number.
+  //
+  // This is the base case: we've run out of arguments to check, so we're all
+  // good.
+  template <int kArgumentNumber, bool kShouldStaticAssert>
+  static constexpr bool CheckParam(
+      typename std::enable_if<(kArgumentNumber < 0)>::type *dummy = nullptr) {
+    return true;
+  }
+
+  // Checks the parameter/argument match at kArgumentNumber.
+  // kShouldStaticAssert determines whether to assert out on a mismatch, or just
+  // yield the constexpr boolean value.
+  template <int kArgumentNumber, bool kShouldStaticAssert>
+  static constexpr bool CheckParam(
+      typename std::enable_if<(kArgumentNumber >= 0)>::type *dummy = nullptr) {
+    typedef typename std::tuple_element<kArgumentNumber, ParamTuple>::type
+        ParamT;
+    typedef typename std::tuple_element<kArgumentNumber, ArgTuple>::type ArgT;
+    return Compatible<ParamT, ArgT, kShouldStaticAssert, kArgumentNumber>() &&
+           CheckParam<kArgumentNumber - 1, kShouldStaticAssert>();
+  }
+
+  // Checks the parameters/arguments for match, but doesn't static assert out.
+  // This is useful for inspecting whether a set of parameters matches, e.g. in
+  // tests.
+  static constexpr bool CheckAllNoStaticAssert() {
+    return kLengthMatches && CheckParam<kTupleLength - 1, false>();
+  }
+
+  // Checks the parameters and static asserts out with a helpful error message
+  // (and useful template parameters in the instantiation stack) if there is an
+  // error.
+  static constexpr bool CheckAllStaticAssert() {
+    static_assert(kLengthMatches,
+                  "argument length mismatched against typed kernel parameters");
+    return kLengthMatches && CheckParam<kTupleLength - 1, true>();
+  }
+};
+
+// This is a convenience type for checking whether a typed kernel matches
+// against a type list.
+template <typename KernelT, typename... Params>
+struct KernelParamsOk {
+  static constexpr bool kResult = false;
+};
+
+// See above.
+template <typename... Params, typename... Args>
+struct KernelParamsOk<TypedKernel<Params...>, Args...> {
+  static constexpr bool kResult = KernelInvocationChecker<
+      std::tuple<Params...>, std::tuple<Args...>>::CheckAllNoStaticAssert();
+};
+
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
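Because KernelParamsOk exposes its verdict as a constexpr boolean, an argument
list can be checked against a kernel's parameter pack without tripping the
static_assert path. A small sketch (the kernel alias is hypothetical, and the
names are assumed to be visible from namespace perftools::gputools):

  using AddKernel = TypedKernel<const DeviceMemory<float> &, int>;
  static_assert(KernelParamsOk<AddKernel, DeviceMemory<float>, int>::kResult,
                "a float device array and an int satisfy AddKernel");
  static_assert(
      !KernelParamsOk<AddKernel, DeviceMemory<double>, int>::kResult,
      "an element-type mismatch is rejected");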
diff --git a/tensorflow/stream_executor/kernel_cache_config.h b/tensorflow/stream_executor/kernel_cache_config.h
new file mode 100644
index 0000000000..9675d2940c
--- /dev/null
+++ b/tensorflow/stream_executor/kernel_cache_config.h
@@ -0,0 +1,29 @@
+// This file contains declarations relating to kernel cache configuration
+// parameters recognized by the StreamExecutor.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
+#define TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
+
+namespace perftools {
+namespace gputools {
+
+// This enum represents potential configurations of L1/shared memory when
+// running a particular kernel. These values represent user preference, and
+// the runtime is not required to respect these choices.
+enum class KernelCacheConfig {
+  // Indicates no preference for device L1/shared memory configuration.
+  kNoPreference,
+
+  // Indicates a preference for more shared memory than L1 cache.
+  kPreferShared,
+
+  // Indicates a preference for more L1 cache than shared memory.
+  kPreferL1,
+
+  // Indicates a preference for equal amounts of L1 cache and shared memory.
+  kPreferEqual,
+};
+
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
diff --git a/tensorflow/stream_executor/kernel_spec.cc b/tensorflow/stream_executor/kernel_spec.cc
new file mode 100644
index 0000000000..e3b4b0d951
--- /dev/null
+++ b/tensorflow/stream_executor/kernel_spec.cc
@@ -0,0 +1,236 @@
+#include "tensorflow/stream_executor/kernel_spec.h"
+
+namespace perftools {
+namespace gputools {
+
+KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
+    : kernelname_(kernelname.ToString()) {}
+
+OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(port::StringPiece filename,
+                                               port::StringPiece kernelname)
+    : KernelLoaderSpec(kernelname), filename_(filename.ToString()) {}
+
+CudaPtxOnDisk::CudaPtxOnDisk(port::StringPiece filename,
+                             port::StringPiece kernelname)
+    : OnDiskKernelLoaderSpec(filename, kernelname) {}
+
+CudaCubinOnDisk::CudaCubinOnDisk(port::StringPiece filename,
+                                 port::StringPiece kernelname)
+    : OnDiskKernelLoaderSpec(filename, kernelname) {}
+
+CudaCubinInMemory::CudaCubinInMemory(const char *bytes,
+                                     port::StringPiece kernelname)
+    : KernelLoaderSpec(kernelname), bytes_(bytes) {}
+
+bool CompareComputeCapability(const std::tuple<int, int> &lhs,
+                              const std::tuple<int, int> &rhs) {
+  return std::get<0>(lhs) < std::get<0>(rhs) ||
+         (std::get<0>(lhs) == std::get<0>(rhs) &&
+          std::get<1>(lhs) < std::get<1>(rhs));
+}
+
+const std::tuple<int, int> CudaPtxInMemory::kMinimumCapability{1, 0};
+
+CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx,
+                                 port::StringPiece kernel_name,
+                                 bool ptx_compressed)
+    : KernelLoaderSpec(kernel_name),
+      ptx_by_compute_capability_(CompareComputeCapability) {
+  if (ptx_compressed) {
+    // Lazy decompression. Put an empty string in decompressed_ptx_ to mark
+    // that the original ptx is compressed.
+    decompressed_ptx_[ptx.data()] = "";
+  }
+  ptx_by_compute_capability_[kMinimumCapability] = ptx.data();
+}
+
+CudaPtxInMemory::CudaPtxInMemory(
+    const std::initializer_list<CudaPtxInMemory::PtxSpec> &spec_list,
+    port::StringPiece kernel_name, bool ptx_compressed)
+    : KernelLoaderSpec(kernel_name),
+      ptx_by_compute_capability_(CompareComputeCapability) {
+  for (const auto &spec : spec_list) {
+    int major, minor;
+    port::StringPiece ptx;
+    std::tie(major, minor, ptx) = spec;
+    if (ptx_compressed) {
+      // Lazy decompression. Put an empty string in decompressed_ptx_ to mark
+      // that the original ptx is compressed.
+      decompressed_ptx_[ptx.data()] = "";
+    }
+    ptx_by_compute_capability_[std::tuple<int, int>{major, minor}] = ptx.data();
+  }
+}
+
+string CudaPtxInMemory::DecompressPtx(const char *ptx) {
+  // Get the length of the PTX string from the beginning of the buffer.
+  uint64 ptx_length = *reinterpret_cast<const uint64 *>(ptx);
+  // Get the PTX string from the buffer with offset and length.
+  string compressed_ptx(ptx + sizeof(uint64),
+                        ptx + sizeof(uint64) + ptx_length);
+  string decompressed_ptx;
+  // Decompressing the PTX string with bzip2 is not wired up yet; abort rather
+  // than silently returning an empty string.
+  LOG(FATAL) << "bzip2 decompression is not supported yet.";
+  return decompressed_ptx;
+}
+
+const char *CudaPtxInMemory::default_text() const {
+  if (ptx_by_compute_capability_.empty()) {
+    return nullptr;
+  }
+
+  mutex_lock lock{mu_};
+
+  auto ptx = ptx_by_compute_capability_.begin()->second;
+  // Check if there is an entry in decompressed ptx table.
+  auto decompressed_ptx_iter = decompressed_ptx_.find(ptx);
+  if (decompressed_ptx_iter != decompressed_ptx_.end()) {
+    // If the stored string is empty, the ptx hasn't been decompressed yet;
+    // decompress it here.
+    if (decompressed_ptx_iter->second.size() == 0) {
+      decompressed_ptx_iter->second = DecompressPtx(ptx);
+    }
+    return decompressed_ptx_iter->second.c_str();
+  }
+  return ptx;
+}
+
+const char *CudaPtxInMemory::original_default_text() const {
+  if (ptx_by_compute_capability_.empty()) {
+    return nullptr;
+  }
+
+  return ptx_by_compute_capability_.begin()->second;
+}
+
+const char *CudaPtxInMemory::text(int compute_capability_major,
+                                  int compute_capability_minor) const {
+  std::tuple<int, int> capability{compute_capability_major,
+                                  compute_capability_minor};
+
+  auto ptx_iter = ptx_by_compute_capability_.find(capability);
+  if (ptx_iter == ptx_by_compute_capability_.end()) {
+    return nullptr;
+  }
+
+  mutex_lock lock{mu_};
+
+  // Check if there is an entry in decompressed ptx table.
+  auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
+  if (decompressed_ptx_iter != decompressed_ptx_.end()) {
+    // If the stored string is empty, the ptx hasn't been decompressed yet;
+    // decompress it here.
+    if (decompressed_ptx_iter->second.size() == 0) {
+      decompressed_ptx_iter->second = DecompressPtx(ptx_iter->second);
+    }
+    return decompressed_ptx_iter->second.c_str();
+  }
+  return ptx_iter->second;
+}
+
+const char *CudaPtxInMemory::original_text(int compute_capability_major,
+                                           int compute_capability_minor) const {
+  std::tuple<int, int> capability{compute_capability_major,
+                                  compute_capability_minor};
+
+  auto ptx_iter = ptx_by_compute_capability_.find(capability);
+  if (ptx_iter == ptx_by_compute_capability_.end()) {
+    return nullptr;
+  }
+
+  return ptx_iter->second;
+}
+
+OpenCLTextOnDisk::OpenCLTextOnDisk(port::StringPiece filename,
+                                   port::StringPiece kernelname)
+    : OnDiskKernelLoaderSpec(filename, kernelname) {}
+
+OpenCLTextInMemory::OpenCLTextInMemory(port::StringPiece text,
+                                       port::StringPiece kernelname)
+    : KernelLoaderSpec(kernelname), text_(text.ToString()) {}
+
+OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(port::StringPiece filename,
+                                       port::StringPiece kernelname)
+    : OnDiskKernelLoaderSpec(filename, kernelname) {}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk(
+    port::StringPiece filename, port::StringPiece kernelname) {
+  CHECK(ocl_text_on_disk_ == nullptr);
+  ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk(
+    port::StringPiece filename, port::StringPiece kernelname) {
+  CHECK(ocl_binary_on_disk_ == nullptr);
+  ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory(
+    port::StringPiece ocl_text, port::StringPiece kernelname) {
+  CHECK(ocl_text_in_memory_ == nullptr);
+  ocl_text_in_memory_.reset(new OpenCLTextInMemory{ocl_text, kernelname});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk(
+    port::StringPiece filename, port::StringPiece kernelname) {
+  CHECK(cuda_ptx_on_disk_ == nullptr);
+  cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory(
+    const char *bytes, port::StringPiece kernelname) {
+  CHECK(cuda_cubin_in_memory_ == nullptr);
+  cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk(
+    port::StringPiece filename, port::StringPiece kernelname) {
+  CHECK(cuda_cubin_on_disk_ == nullptr);
+  cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
+    port::StringPiece ptx, port::StringPiece kernelname) {
+  CHECK(cuda_ptx_in_memory_ == nullptr);
+  cuda_ptx_in_memory_.reset(
+      new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
+    port::StringPiece ptx, port::StringPiece kernelname) {
+  CHECK(cuda_ptx_in_memory_ == nullptr);
+  cuda_ptx_in_memory_.reset(
+      new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
+    std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
+    port::StringPiece kernelname) {
+  CHECK(cuda_ptx_in_memory_ == nullptr);
+  cuda_ptx_in_memory_.reset(
+      new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */});
+  return this;
+}
+
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
+    std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
+    port::StringPiece kernelname) {
+  CHECK(cuda_ptx_in_memory_ == nullptr);
+  cuda_ptx_in_memory_.reset(
+      new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */});
+  return this;
+}
+
+MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}
+
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h
new file mode 100644
index 0000000000..01a47ac253
--- /dev/null
+++ b/tensorflow/stream_executor/kernel_spec.h
@@ -0,0 +1,365 @@
+// Kernel-loader specs are structures that describe how to load a data-parallel
+// kernel on a given platform for subsequent launching. Headers that instantiate
+// these data structures will typically be auto-generated. However, users can
+// also instantiate them by hand.
+//
+// A kernel with the same exact functionality and type signature may be
+// implemented on several different platforms. Typical usage is to create a
+// singleton that describes how to load a kernel on the various supported
+// platforms:
+//
+//  static const MultiKernelLoaderSpec &SaxpySpec() {
+//    static auto *mkls =
+//        (new MultiKernelLoaderSpec{4 /* = arity */})
+//            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
+//            ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
+//
+//    return *mkls;
+//  }
+//
+// This lazily instantiates an object that describes how to load CUDA PTX
+// present on disk that implements saxpy for the CUDA platform, or
+// OpenCL text present on disk that implements saxpy for an OpenCL-based
+// platform. The CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
+// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
+// subsequent launching on a single platform.
+//
+// For the loader functionality that accepts these KernelLoaderSpecs in order
+// to grab the kernel appropriately, see StreamExecutor::GetKernel().
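+//
+// As a further illustration (the names here are hypothetical), PTX for several
+// compute capabilities can be registered in memory via
+// CudaPtxInMemory::PtxSpec entries; the version matching the device's compute
+// capability is then retrievable via CudaPtxInMemory::text():
+//
+//  static auto *mkls =
+//      (new MultiKernelLoaderSpec{4 /* = arity */})
+//          ->AddCudaPtxInMemory(
+//              {CudaPtxInMemory::PtxSpec{2, 0, saxpy_ptx_cc20},
+//               CudaPtxInMemory::PtxSpec{3, 5, saxpy_ptx_cc35}},
+//              "saxpy");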
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
+#define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
+
+#include <stddef.h>
+#include <map>
+#include <memory>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+// Describes how to load a kernel on a target platform.
+//
+// This is an abstract base class, subclassed for specific platforms.
+// Subclasses store the program location (e.g. a PTX or OpenCL loadable
+// translation unit path, or in-memory program text); whether it is a filename
+// or text is exposed via more specifically named accessors in subclasses.
+//
+// These kernel loader specifications are typically auto-generated into header
+// files at build time, but can also be specified manually.
+class KernelLoaderSpec {
+ public:
+  virtual ~KernelLoaderSpec() {}
+
+  // Returns the kernel name to load out of the program.
+  const string &kernelname() const { return kernelname_; }
+
+ protected:
+  explicit KernelLoaderSpec(port::StringPiece kernelname);
+
+ private:
+  // The kernel name that should be loaded out of the program description given
+  // above.
+  string kernelname_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
+};
+
+// An abstract kernel loader spec that has an associated file path, where
+// there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose
+// canonical filename suffix is ".ptx".
+class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
+ public:
+  ~OnDiskKernelLoaderSpec() override {}
+
+  // Returns the path to the on-disk loadable kernel file.
+  const string &filename() const { return filename_; }
+
+  // Returns the canonical suffix for this on-disk kernel loader spec format;
+  // e.g. PTX files on disk have a canonical suffix of ".ptx".
+  virtual const char *CanonicalSuffix() const = 0;
+
+ protected:
+  OnDiskKernelLoaderSpec(port::StringPiece filename,
+                         port::StringPiece kernelname);
+
+  string filename_;
+
+ private:
+  SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec);
+};
+
+// Kernel loader specification for PTX text that resides on disk.
+class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
+ public:
+  CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+  ~CudaPtxOnDisk() override {}
+
+  const char *CanonicalSuffix() const override { return ".ptx"; }
+
+ private:
+  SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk);
+};
+
+// Kernel loader specification for CUBIN binary that resides on disk.
+class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
+ public:
+  CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+  ~CudaCubinOnDisk() override {}
+
+  const string &filename() const { return filename_; }
+
+  const char *CanonicalSuffix() const override { return ".cubin"; }
+
+ private:
+  string filename_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk);
+};
+
+// Kernel loader specification for PTX text that resides in memory.
+class CudaPtxInMemory : public KernelLoaderSpec {
+ public:
+  // Components: compute capability major number, compute capability minor
+  // number, and PTX source.
+  typedef std::tuple<int, int, port::StringPiece> PtxSpec;
+
+  // Single-PTX constructor. Adds the provided PTX version with an unknown
+  // compute capability. Since the CC is unknown, the PTX is assumed to be very
+  // generally usable - in other words, PTX specified in this manner is VERY
+  // likely to be used as the default! Note that the PTX can be compressed,
+  // which is indicated by the argument ptx_compressed.
+  //
+  // Warning: the string backing the provided port::StringPiece ptx must outlive
+  // this instance.
+  CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname,
+                  bool ptx_compressed = false);
+
+  // Multiple-PTX-version constructor. Adds each item in spec_list to this
+  // object. Note that the PTX can be compressed, which is indicated by the
+  // argument ptx_compressed.
+  CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list,
+                  port::StringPiece kernel_name, bool ptx_compressed = false);
+  ~CudaPtxInMemory() override {}
+
+  // Add the PTX implementation described by ptx_spec to this object. On
+  // collision (i.e., if a version with the same compute_capability already
+  // exists), the existing implementation will be overwritten.
+  void AddSpec(PtxSpec ptx_spec);
+
+  // Returns pointer to the ptx of the available implementation with the
+  // lowest-valued compute capability. For example, if PTX written for CC2.0,
+  // 3.0, and 3.5 is all available, the version for CC2.0 will be returned.
+  // Returns nullptr on failed lookup (if no version is available).
+  // When the ptx is compressed, returns the decompressed ptx.
+  const char *default_text() const;
+
+  // Similar to default_text().
+  // When the ptx is compressed, returns the original compressed ptx.
+  const char *original_default_text() const;
+
+  // Returns pointer to the ptx for the requested compute capability.
+  // Returns nullptr on failed lookup (if the requested version is not
+  // available).
+  // When the ptx is compressed, returns the decompressed ptx.
+  const char *text(int compute_capability_major,
+                   int compute_capability_minor) const;
+
+  // Similar to text().
+  // When the ptx is compressed, returns the original compressed ptx.
+  const char *original_text(int compute_capability_major,
+                            int compute_capability_minor) const;
+
+  // Decompresses the PTX string using bzip2.
+  static string DecompressPtx(const char *ptx);
+
+ private:
+  // PTX translation unit text contents in memory. The key is a tuple
+  // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". Because CCs
+  // represented in this way have a clear sorting order, map::begin() will give
+  // the lowest-numbered version available, i.e. the default.
+  std::map<std::tuple<int, int>, const char *,
+           bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)>
+      ptx_by_compute_capability_;
+
+  // Stores all decompressed ptx strings, with original ptx string as keys.
+  // It is marked as mutable for lazy decompression.
+  mutable std::map<const char *, string> decompressed_ptx_;
+  mutable mutex mu_;
+
+  // Defines the minimum compute capability possible. Used when PTX has no
+  // compute capability specified (in the single-PTX constructor).
+  static const std::tuple<int, int> kMinimumCapability;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
+};
+
+// Kernel loader specification for OpenCL text that resides on disk.
+class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec { + public: + OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname); + ~OpenCLTextOnDisk() override {} + + const char *CanonicalSuffix() const override { return ".ocl"; } + + private: + SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk); +}; + +// Kernel loader specification for OpenCL binary that resides on disk. +class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec { + public: + OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname); + ~OpenCLBinaryOnDisk() override {} + + const char *CanonicalSuffix() const override { return ".aocx"; } + + private: + SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk); +}; + +// Kernel loader specification for OpenCL text that resides in memory. +class OpenCLTextInMemory : public KernelLoaderSpec { + public: + OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname); + ~OpenCLTextInMemory() override {} + + // Returns the OpenCL text contents. + const string &text() const { return text_; } + + private: + // OpenCL translation unit text contents in memory. + string text_; + + SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory); +}; + +// Kernel loader specification for a CUBIN blob that resides in memory. +class CudaCubinInMemory : public KernelLoaderSpec { + public: + CudaCubinInMemory(const char *bytes, port::StringPiece kernelname); + ~CudaCubinInMemory() override {} + + const char *bytes() const { return bytes_; } + + private: + const char *bytes_; + + SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory); +}; + +// Describes how to load a kernel on any subset of a number of target platforms. +class MultiKernelLoaderSpec { + public: + explicit MultiKernelLoaderSpec(size_t arity); + + // Returns the number of arguments that this kernel accepts. + size_t arity() const { return arity_; } + + // Convenience getters for testing whether these platform variants have + // kernel loader specifications available. + bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; } + bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; } + bool has_cuda_cubin_in_memory() const { + return cuda_cubin_in_memory_ != nullptr; + } + bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; } + bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; } + bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; } + bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; } + + // Accessors for platform variant kernel load specifications. + // Precondition: corresponding has_* is true. 
+ const CudaPtxOnDisk &cuda_ptx_on_disk() const { + CHECK(has_cuda_ptx_on_disk()); + return *cuda_ptx_on_disk_; + } + const CudaCubinOnDisk &cuda_cubin_on_disk() const { + CHECK(has_cuda_cubin_on_disk()); + return *cuda_cubin_on_disk_; + } + const CudaCubinInMemory &cuda_cubin_in_memory() const { + CHECK(has_cuda_cubin_in_memory()); + return *cuda_cubin_in_memory_; + } + const CudaPtxInMemory &cuda_ptx_in_memory() const { + CHECK(has_cuda_ptx_in_memory()); + return *cuda_ptx_in_memory_; + } + const OpenCLTextOnDisk &ocl_text_on_disk() const { + CHECK(has_ocl_text_on_disk()); + return *ocl_text_on_disk_; + } + const OpenCLBinaryOnDisk &ocl_binary_on_disk() const { + CHECK(has_ocl_binary_on_disk()); + return *ocl_binary_on_disk_; + } + const OpenCLTextInMemory &ocl_text_in_memory() const { + CHECK(has_ocl_text_in_memory()); + return *ocl_text_in_memory_; + } + + // Builder-pattern-like methods for use in initializing a + // MultiKernelLoaderSpec. Each of these should be used at most once for a + // single MultiKernelLoaderSpec object. See file comment for example usage. + // + // Note that the kernelname parameter must be consistent with the kernel in + // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel + // name may be mangled by the compiler if it is not declared in an + // extern "C" scope. + MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( + port::StringPiece ptx, port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaPtxInMemory( + std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, + port::StringPiece kernelname); + MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( + std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, + port::StringPiece kernelname); + + private: + std::unique_ptr<CudaPtxOnDisk> + cuda_ptx_on_disk_; // PTX text that resides in a file. + std::unique_ptr<CudaCubinOnDisk> + cuda_cubin_on_disk_; // Binary CUDA program in a file. + std::unique_ptr<CudaCubinInMemory> + cuda_cubin_in_memory_; // Binary CUDA program in memory. + std::unique_ptr<CudaPtxInMemory> + cuda_ptx_in_memory_; // PTX text that resides in memory. + std::unique_ptr<OpenCLTextOnDisk> + ocl_text_on_disk_; // OpenCL text that resides on disk. + std::unique_ptr<OpenCLBinaryOnDisk> + ocl_binary_on_disk_; // OpenCL binary that resides on disk. + std::unique_ptr<OpenCLTextInMemory> + ocl_text_in_memory_; // OpenCL text that resides in memory. + + // Number of parameters that the kernel takes. (This is nicer to have in a + // constexpr than having to determine it from the types via template + // metaprogramming). 
+  size_t arity_;
+};
+
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
diff --git a/tensorflow/stream_executor/launch_dim.h b/tensorflow/stream_executor/launch_dim.h
new file mode 100644
index 0000000000..9b870ed6aa
--- /dev/null
+++ b/tensorflow/stream_executor/launch_dim.h
@@ -0,0 +1,65 @@
+// Types to express dimensionality of a kernel launch. Blocks and threads
+// are (up to) 3-dimensional.
+//
+// A thread is conceptually like a SIMD lane. Some number of SIMD lanes,
+// typically 32 (though that fact should not be relied on), are tied together
+// with a single PC in a unit called a warp. There is a maximum number of
+// threads that can execute in a shared-context entity called a block.
+// Presently, that number is 1024 -- again, something that should not be
+// relied on from this comment, but checked via
+// perftools::gputools::DeviceDescription.
+//
+// For additional information, see
+// http://docs.nvidia.com/cuda/kepler-tuning-guide/#device-utilization-and-occupancy
+//
+// Because of that modest thread-per-block limit, a kernel can be launched
+// with multiple blocks. Each block is indivisibly scheduled onto a single
+// core. Blocks can also be used in a multi-dimensional configuration, and
+// the block count has much less modest limits -- typically the limits are
+// similar to the maximum amount of addressable memory.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/lib/strcat.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+// Basic type that represents a 3-dimensional index space.
+struct Dim3D {
+  uint64 x, y, z;
+
+  Dim3D(uint64 x, uint64 y, uint64 z) : x(x), y(y), z(z) {}
+};
+
+// Thread dimensionality for use in a kernel launch. See file comment for
+// details.
+struct ThreadDim : public Dim3D {
+  explicit ThreadDim(uint64 x = 1, uint64 y = 1, uint64 z = 1)
+      : Dim3D(x, y, z) {}
+
+  // Returns a string representation of the thread dimensionality.
+  string ToString() const {
+    return port::StrCat("ThreadDim{", x, ", ", y, ", ", z, "}");
+  }
+};
+
+// Block dimensionality for use in a kernel launch. See file comment for
+// details.
+struct BlockDim : public Dim3D {
+  explicit BlockDim(uint64 x = 1, uint64 y = 1, uint64 z = 1)
+      : Dim3D(x, y, z) {}
+
+  // Returns a string representation of the block dimensionality.
+  string ToString() const {
+    return port::StrCat("BlockDim{", x, ", ", y, ", ", z, "}");
+  }
+};
+
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
diff --git a/tensorflow/stream_executor/lib/array_slice.h b/tensorflow/stream_executor/lib/array_slice.h
new file mode 100644
index 0000000000..271b1c15a0
--- /dev/null
+++ b/tensorflow/stream_executor/lib/array_slice.h
@@ -0,0 +1,17 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_ARRAY_SLICE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_ARRAY_SLICE_H_
+
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+using tensorflow::gtl::ArraySlice;
+using tensorflow::gtl::MutableArraySlice;
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ARRAY_SLICE_H_
diff --git a/tensorflow/stream_executor/lib/casts.h b/tensorflow/stream_executor/lib/casts.h
new file mode 100644
index 0000000000..61ff2ab00e
--- /dev/null
+++ b/tensorflow/stream_executor/lib/casts.h
@@ -0,0 +1,85 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
+
+#include <stdlib.h>
+#include <string.h>  // for memcpy, used by bit_cast below
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+// port::bit_cast<Dest,Source> is a template function that implements the
+// equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
+// very low-level functions like the protobuf library and fast math
+// support.
+//
+//   float f = 3.14159265358979;
+//   int i = port::bit_cast<int32>(f);
+//   // i = 0x40490fdb
+//
+// The classical address-casting method is:
+//
+//   // WRONG
+//   float f = 3.14159265358979;            // WRONG
+//   int i = * reinterpret_cast<int*>(&f);  // WRONG
+//
+// The address-casting method actually produces undefined behavior
+// according to ISO C++ specification section 3.10 -15-. Roughly, this
+// section says: if an object in memory has one type, and a program
+// accesses it with a different type, then the result is undefined
+// behavior for most values of "different type".
+//
+// This is true for any cast syntax, either *(int*)&f or
+// *reinterpret_cast<int*>(&f). And it is particularly true for
+// conversions between integral lvalues and floating-point lvalues.
+//
+// The purpose of 3.10 -15- is to allow optimizing compilers to assume
+// that expressions with different types refer to different memory. gcc
+// 4.0.1 has an optimizer that takes advantage of this. So a
+// non-conforming program quietly produces wildly incorrect output.
+//
+// The problem is not the use of reinterpret_cast. The problem is type
+// punning: holding an object in memory of one type and reading its bits
+// back using a different type.
+//
+// The C++ standard is more subtle and complex than this, but that
+// is the basic idea.
+//
+// Anyways ...
+//
+// port::bit_cast<> calls memcpy() which is blessed by the standard,
+// especially by the example in section 3.9. Also, of course,
+// port::bit_cast<> wraps up the nasty logic in one place.
+//
+// Fortunately memcpy() is very fast. In optimized mode, with a
+// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
+// code with the minimal amount of data movement. On a 32-bit system,
+// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
+// compiles to two loads and two stores.
+//
+// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
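+//
+// As a round-trip sketch (the values here are purely illustrative), the
+// cast is exact in both directions:
+//
+//   double d = 1.0;
+//   uint64 bits = port::bit_cast<uint64>(d);   // bits == 0x3ff0000000000000
+//   double d2 = port::bit_cast<double>(bits);  // d2 == 1.0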
+// +// WARNING: if Dest or Source is a non-POD type, the result of the memcpy +// is likely to surprise you. +// +// Props to Bill Gibbons for the compile time assertion technique and +// Art Komninos and Igor Tandetnik for the msvc experiments. +// +// -- mec 2005-10-17 + +template <class Dest, class Source> +inline Dest bit_cast(const Source& source) { + // Compile time assertion: sizeof(Dest) == sizeof(Source) + // A compile error here means your Dest and Source have different sizes. + static_assert(sizeof(Dest) == sizeof(Source), + "src and dst types must have equal sizes"); + + Dest dest; + memcpy(&dest, &source, sizeof(dest)); + return dest; +} + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_ diff --git a/tensorflow/stream_executor/lib/demangle.cc b/tensorflow/stream_executor/lib/demangle.cc new file mode 100644 index 0000000000..6b837b803a --- /dev/null +++ b/tensorflow/stream_executor/lib/demangle.cc @@ -0,0 +1,38 @@ +#include "tensorflow/stream_executor/lib/demangle.h" + +#if (__GNUC__ >= 4 || (__GNUC__ >= 3 && __GNUC_MINOR__ >= 4)) && \ + !defined(__mips__) +# define HAS_CXA_DEMANGLE 1 +#else +# define HAS_CXA_DEMANGLE 0 +#endif + +#include <stdlib.h> +#if HAS_CXA_DEMANGLE +#include <cxxabi.h> +#endif + +namespace perftools { +namespace gputools { +namespace port { + +// The API reference of abi::__cxa_demangle() can be found in +// libstdc++'s manual. +// https://gcc.gnu.org/onlinedocs/libstdc++/libstdc++-html-USERS-4.3/a01696.html +string Demangle(const char *mangled) { + string demangled; + int status = 0; + char *result = NULL; +#if HAS_CXA_DEMANGLE + result = abi::__cxa_demangle(mangled, NULL, NULL, &status); +#endif + if (status == 0 && result != NULL) { // Demangling succeeeded. 
+    demangled.append(result);
+    free(result);
+  }
+  return demangled;
+}
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/lib/demangle.h b/tensorflow/stream_executor/lib/demangle.h
new file mode 100644
index 0000000000..0420f7101f
--- /dev/null
+++ b/tensorflow/stream_executor/lib/demangle.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_DEMANGLE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_DEMANGLE_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+string Demangle(const char* mangled);
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_DEMANGLE_H_
diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h
new file mode 100644
index 0000000000..74b50ad42d
--- /dev/null
+++ b/tensorflow/stream_executor/lib/env.h
@@ -0,0 +1,29 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
+
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+using tensorflow::Env;
+using tensorflow::ReadFileToString;
+using tensorflow::Thread;
+using tensorflow::WriteStringToFile;
+
+inline bool FileExists(const string& filename) {
+  return Env::Default()->FileExists(filename);
+}
+
+inline bool FileExists(const port::StringPiece& filename) {
+  return Env::Default()->FileExists(filename.ToString());
+}
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
diff --git a/tensorflow/stream_executor/lib/error.h b/tensorflow/stream_executor/lib/error.h
new file mode 100644
index 0000000000..376ddd3d07
--- /dev/null
+++ b/tensorflow/stream_executor/lib/error.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
+
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+namespace error = tensorflow::error;
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
diff --git a/tensorflow/stream_executor/lib/human_readable.h b/tensorflow/stream_executor/lib/human_readable.h
new file mode 100644
index 0000000000..78df4a4a70
--- /dev/null
+++ b/tensorflow/stream_executor/lib/human_readable.h
@@ -0,0 +1,58 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_HUMAN_READABLE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_HUMAN_READABLE_H_
+
+#include <assert.h>
+#include <limits>
+
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+class HumanReadableNumBytes {
+ public:
+  static string ToString(int64 num_bytes) {
+    if (num_bytes == std::numeric_limits<int64>::min()) {
+      // Special case for the value whose negation is not representable.
+      return "-8E";
+    }
+
+    const char* neg_str = GetNegStr(&num_bytes);
+
+    // Special case for bytes.
+    if (num_bytes < 1024LL) {
+      // No fractions for bytes.
+      return port::Printf("%s%lldB", neg_str, num_bytes);
+    }
+
+    static const char units[] = "KMGTPE";  // int64 only goes up to E.
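+    // Walk up the unit ladder: divide by 1024 while at least one full unit
+    // step remains, so the final division below yields a value in [1, 1024)
+    // expressed in the selected unit.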
+ const char* unit = units; + while (num_bytes >= (1024LL) * (1024LL)) { + num_bytes /= (1024LL); + ++unit; + assert(unit < units + sizeof(units)); + } + + return port::Printf(((*unit == 'K') ? "%s%.1f%c" : "%s%.2f%c"), neg_str, + num_bytes / 1024.0, *unit); + } + + private: + template <typename T> + static const char* GetNegStr(T* value) { + if (*value < 0) { + *value = -(*value); + return "-"; + } else { + return ""; + } + } +}; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_HUMAN_READABLE_H_ diff --git a/tensorflow/stream_executor/lib/initialize.h b/tensorflow/stream_executor/lib/initialize.h new file mode 100644 index 0000000000..d1832d6b26 --- /dev/null +++ b/tensorflow/stream_executor/lib/initialize.h @@ -0,0 +1,35 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_ + +#include "tensorflow/stream_executor/platform/port.h" + +#if defined(PLATFORM_GOOGLE) +#else + +#undef REGISTER_MODULE_INITIALIZER + +namespace perftools { +namespace gputools { +namespace port { + +class Initializer { + public: + typedef void (*InitializerFunc)(); + explicit Initializer(InitializerFunc func) { func(); } +}; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#define REGISTER_INITIALIZER(type, name, body) \ + static void google_init_##type##_##name() { body; } \ + perftools::gputools::port::Initializer google_initializer_##type##_##name( \ + google_init_##type##_##name) + +#define REGISTER_MODULE_INITIALIZER(name, body) \ + REGISTER_INITIALIZER(module, name, body) + +#endif // !defined(PLATFORM_GOOGLE) + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_ diff --git a/tensorflow/stream_executor/lib/inlined_vector.h b/tensorflow/stream_executor/lib/inlined_vector.h new file mode 100644 index 0000000000..e1f7a29904 --- /dev/null +++ b/tensorflow/stream_executor/lib/inlined_vector.h @@ -0,0 +1,16 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_ + +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::gtl::InlinedVector; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_ diff --git a/tensorflow/stream_executor/lib/mathutil.h b/tensorflow/stream_executor/lib/mathutil.h new file mode 100644 index 0000000000..dd3d37a19c --- /dev/null +++ b/tensorflow/stream_executor/lib/mathutil.h @@ -0,0 +1,88 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_MATHUTIL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_MATHUTIL_H_ + +#include <algorithm> +#include <cmath> +#include <limits> +#include <type_traits> +#include <vector> + +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace port { + +class MathUtil { + public: + template <typename IntegralType> + static IntegralType CeilOfRatio(IntegralType numerator, + IntegralType denominator) { + return CeilOrFloorOfRatio<IntegralType, true>(numerator, denominator); + } + template <typename IntegralType> + static IntegralType FloorOfRatio(IntegralType numerator, + IntegralType denominator) { + return CeilOrFloorOfRatio<IntegralType, false>(numerator, denominator); + } + template <typename IntegralType, bool ceil> + static IntegralType CeilOrFloorOfRatio(IntegralType numerator, + 
IntegralType denominator);
+};
+
+// ---- CeilOrFloorOfRatio ----
+// This is a branching-free, cast-to-double-free implementation.
+//
+// Casting to double is in general incorrect because of loss of precision
+// when casting an int64 into a double.
+//
+// There's a bunch of 'recipes' to compute an integer ceil (or floor) on the
+// web, and most of them are incorrect.
+template<typename IntegralType, bool ceil>
+IntegralType MathUtil::CeilOrFloorOfRatio(IntegralType numerator,
+                                          IntegralType denominator) {
+  static_assert(std::is_integral<IntegralType>::value,
+                "CeilOfRatio_is_only_defined_for_integral_types");
+  assert(denominator != 0);
+  // Dividing the smallest signed integer by -1 is not supported: it would
+  // SIGFPE
+  assert(!std::is_signed<IntegralType>::value ||
+         numerator != std::numeric_limits<IntegralType>::min() ||
+         denominator != -1);
+
+  const IntegralType rounded_toward_zero = numerator / denominator;
+  const IntegralType intermediate_product = rounded_toward_zero * denominator;
+
+  if (ceil) {  // Compile-time condition: not an actual branch
+    // When rounded_toward_zero is negative, an adjustment is never needed:
+    // the real ratio is negative, and so rounded toward zero is the ceil.
+    // When rounded_toward_zero is non-negative, an adjustment is needed if the
+    // sign of the difference numerator - intermediate_product is the same as
+    // the sign of the denominator.
+    //
+    // Using a bool and then a static_cast to IntegralType is not strictly
+    // necessary, but it makes the code clear, and anyway the compiler should
+    // get rid of it.
+    const bool needs_adjustment = (rounded_toward_zero >= 0) &&
+        ((denominator > 0 && numerator > intermediate_product) ||
+         (denominator < 0 && numerator < intermediate_product));
+    const IntegralType adjustment = static_cast<IntegralType>(needs_adjustment);
+    const IntegralType ceil_of_ratio = rounded_toward_zero + adjustment;
+    return ceil_of_ratio;
+  } else {
+    // Floor case: symmetrical to the previous one
+    const bool needs_adjustment = (rounded_toward_zero <= 0) &&
+        ((denominator > 0 && numerator < intermediate_product) ||
+         (denominator < 0 && numerator > intermediate_product));
+    const IntegralType adjustment = static_cast<IntegralType>(needs_adjustment);
+    const IntegralType floor_of_ratio = rounded_toward_zero - adjustment;
+    return floor_of_ratio;
+  }
+}
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_MATHUTIL_H_
diff --git a/tensorflow/stream_executor/lib/notification.h b/tensorflow/stream_executor/lib/notification.h
new file mode 100644
index 0000000000..2baa458fc9
--- /dev/null
+++ b/tensorflow/stream_executor/lib/notification.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
+
+#include "tensorflow/core/lib/core/notification.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+using tensorflow::Notification;
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
diff --git a/tensorflow/stream_executor/lib/numbers.cc b/tensorflow/stream_executor/lib/numbers.cc
new file mode 100644
index 0000000000..a9981b0ce6
--- /dev/null
+++ b/tensorflow/stream_executor/lib/numbers.cc
@@ -0,0 +1,27 @@
+#include "tensorflow/stream_executor/lib/numbers.h"
+
+#include <ctype.h>  // for isspace, used below
+#include <stdlib.h>
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+bool safe_strto32(const char* str,
int32* value) {
+  char* endptr;
+  *value = strtol(str, &endptr, 10);  // NOLINT
+  if (endptr != str) {
+    while (isspace(*endptr)) ++endptr;
+  }
+  return *str != '\0' && *endptr == '\0';
+}
+
+// Converts a string to a 32-bit integer value.
+// Leading and trailing spaces are allowed.
+// Note: overflow is not detected, so out-of-range inputs yield an
+// unspecified value.
+bool safe_strto32(const string& str, int32* value) {
+  return port::safe_strto32(str.c_str(), value);
+}
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/lib/numbers.h b/tensorflow/stream_executor/lib/numbers.h
new file mode 100644
index 0000000000..17b2893743
--- /dev/null
+++ b/tensorflow/stream_executor/lib/numbers.h
@@ -0,0 +1,19 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_NUMBERS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_NUMBERS_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+// Converts a string to a 32-bit integer value.
+// Leading and trailing spaces are allowed.
+bool safe_strto32(const string& str, int32* value);
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NUMBERS_H_
diff --git a/tensorflow/stream_executor/lib/path.cc b/tensorflow/stream_executor/lib/path.cc
new file mode 100644
index 0000000000..a6e76e99b7
--- /dev/null
+++ b/tensorflow/stream_executor/lib/path.cc
@@ -0,0 +1,50 @@
+#include "tensorflow/stream_executor/lib/path.h"
+#include "tensorflow/stream_executor/lib/strcat.h"
+
+using ::perftools::gputools::port::StringPiece;
+using ::perftools::gputools::port::StrAppend;
+
+namespace perftools {
+namespace gputools {
+namespace port {
+namespace internal {
+
+static bool IsAbsolutePath(port::StringPiece path) {
+  return !path.empty() && path[0] == '/';
+}
+
+// For an array of paths of length count, append them all together,
+// ensuring that the proper path separators are inserted between them.
+string JoinPathImpl(std::initializer_list<port::StringPiece> paths) {
+  string result;
+
+  for (port::StringPiece path : paths) {
+    if (path.empty()) continue;
+
+    if (result.empty()) {
+      result = path.ToString();
+      continue;
+    }
+
+    if (result[result.size() - 1] == '/') {
+      if (IsAbsolutePath(path)) {
+        StrAppend(&result, path.substr(1));
+      } else {
+        StrAppend(&result, path);
+      }
+    } else {
+      if (IsAbsolutePath(path)) {
+        StrAppend(&result, path);
+      } else {
+        StrAppend(&result, "/", path);
+      }
+    }
+  }
+
+  return result;
+}
+
+}  // namespace internal
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/lib/path.h b/tensorflow/stream_executor/lib/path.h
new file mode 100644
index 0000000000..1d648e8de1
--- /dev/null
+++ b/tensorflow/stream_executor/lib/path.h
@@ -0,0 +1,44 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
+
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+namespace internal {
+// TODO(rspringer): Move to cc/implementation file.
+// Not part of the public API.
+string JoinPathImpl(std::initializer_list<port::StringPiece> paths);
+}  // namespace internal
+
+// Join multiple paths together.
+// JoinPath unconditionally joins all paths together.
For example:
+//
+//  Arguments                  | JoinPath
+//  ---------------------------+---------------------
+//  '/foo', 'bar'              | /foo/bar
+//  '/foo/', 'bar'             | /foo/bar
+//  '/foo', '/bar'             | /foo/bar
+//  '/foo', '/bar', '/baz'     | /foo/bar/baz
+//
+// All paths after the first will be treated as relative paths, regardless of
+// whether or not they start with a leading '/'. That is, all paths will be
+// concatenated together, with the appropriate path separator inserted in
+// between.
+// Arguments must be convertible to port::StringPiece.
+//
+// Usage:
+//   string path = port::JoinPath("/var/log", dirname, filename);
+//   string path = port::JoinPath(FLAGS_test_srcdir, filename);
+template <typename... T>
+inline string JoinPath(const T&... args) {
+  return internal::JoinPathImpl({args...});
+}
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
diff --git a/tensorflow/stream_executor/lib/process_state.cc b/tensorflow/stream_executor/lib/process_state.cc
new file mode 100644
index 0000000000..c20493b263
--- /dev/null
+++ b/tensorflow/stream_executor/lib/process_state.cc
@@ -0,0 +1,37 @@
+#include "tensorflow/stream_executor/lib/process_state.h"
+
+#include <errno.h>  // for errno/ERANGE, used below
+#include <unistd.h>
+
+#include <memory>
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+string Hostname() {
+  char hostname[1024];
+  gethostname(hostname, sizeof hostname);
+  hostname[sizeof hostname - 1] = 0;
+  return hostname;
+}
+
+bool GetCurrentDirectory(string* dir) {
+  size_t len = 128;
+  std::unique_ptr<char[]> a(new char[len]);
+  for (;;) {
+    char* p = getcwd(a.get(), len);
+    if (p != NULL) {
+      *dir = p;
+      return true;
+    } else if (errno == ERANGE) {
+      len += len;
+      a.reset(new char[len]);
+    } else {
+      return false;
+    }
+  }
+}
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/lib/process_state.h b/tensorflow/stream_executor/lib/process_state.h
new file mode 100644
index 0000000000..b75879499b
--- /dev/null
+++ b/tensorflow/stream_executor/lib/process_state.h
@@ -0,0 +1,17 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_PROCESS_STATE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_PROCESS_STATE_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+string Hostname();
+bool GetCurrentDirectory(string* dir);
+
+}  // namespace port
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PROCESS_STATE_H_
diff --git a/tensorflow/stream_executor/lib/ptr_util.h b/tensorflow/stream_executor/lib/ptr_util.h
new file mode 100644
index 0000000000..d10d0bcb8c
--- /dev/null
+++ b/tensorflow/stream_executor/lib/ptr_util.h
@@ -0,0 +1,48 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
+
+#include <stddef.h>     // for size_t
+
+#include <memory>       // for std::unique_ptr
+#include <type_traits>  // for std::remove_extent
+#include <utility>      // for std::forward
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+// Trait to select overloads and return types for MakeUnique.
+template <typename T>
+struct MakeUniqueResult {
+  using scalar = std::unique_ptr<T>;
+};
+template <typename T>
+struct MakeUniqueResult<T[]> {
+  using array = std::unique_ptr<T[]>;
+};
+template <typename T, size_t N>
+struct MakeUniqueResult<T[N]> {
+  using invalid = void;
+};
+
+// MakeUnique<T>(...) is an early implementation of C++14 std::make_unique.
+// It is designed to be 100% compatible with std::make_unique so that the
+// eventual switchover will be a simple renaming operation.
+template <typename T, typename...
Args> +typename MakeUniqueResult<T>::scalar MakeUnique(Args&&... args) { // NOLINT + return std::unique_ptr<T>( + new T(std::forward<Args>(args)...)); // NOLINT(build/c++11) +} + +// Overload for array of unknown bound. +// The allocation of arrays needs to use the array form of new, +// and cannot take element constructor arguments. +template <typename T> +typename MakeUniqueResult<T>::array MakeUnique(size_t n) { + return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]()); +} + +// Reject arrays of known bound. +template <typename T, typename... Args> +typename MakeUniqueResult<T>::invalid MakeUnique(Args&&... /* args */) = + delete; // NOLINT + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_ diff --git a/tensorflow/stream_executor/lib/stacktrace.h b/tensorflow/stream_executor/lib/stacktrace.h new file mode 100644 index 0000000000..e7d478efe3 --- /dev/null +++ b/tensorflow/stream_executor/lib/stacktrace.h @@ -0,0 +1,18 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STACKTRACE_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STACKTRACE_H_ + +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace port { + +#if !defined(PLATFORM_GOOGLE) +inline string CurrentStackTrace() { return "No stack trace available"; } +#endif + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STACKTRACE_H_ diff --git a/tensorflow/stream_executor/lib/static_threadlocal.h b/tensorflow/stream_executor/lib/static_threadlocal.h new file mode 100644 index 0000000000..9227b2cf0d --- /dev/null +++ b/tensorflow/stream_executor/lib/static_threadlocal.h @@ -0,0 +1,30 @@ +// Copyright 2006 Google Inc. +// All rights reserved. +// Author: Yaz Saito (saito@google.com) +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_ + +// For POD types in TLS mode, s_obj_VAR is the thread-local variable. 
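+//
+// For example (names here are illustrative; the macro must be used at
+// namespace scope):
+//
+//   SE_STATIC_THREAD_LOCAL_POD(int, tls_counter);
+//   ...
+//   *tls_counter.pointer() += 1;  // Touches only this thread's copy.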
+#define SE_STATIC_THREAD_LOCAL_POD(_Type_, _var_) \ + static thread_local _Type_ s_obj_##_var_; \ + namespace { \ + class ThreadLocal_##_var_ { \ + public: \ + ThreadLocal_##_var_() {} \ + void Init() {} \ + inline _Type_ *pointer() const { \ + return &s_obj_##_var_; \ + } \ + inline _Type_ *safe_pointer() const { \ + return &s_obj_##_var_; \ + } \ + _Type_ &get() const { \ + return s_obj_##_var_; \ + } \ + bool is_native_tls() const { return true; } \ + private: \ + SE_DISALLOW_COPY_AND_ASSIGN(ThreadLocal_##_var_); \ + } _var_; \ + } + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_ diff --git a/tensorflow/stream_executor/lib/status.h b/tensorflow/stream_executor/lib/status.h new file mode 100644 index 0000000000..b3ad13b0ae --- /dev/null +++ b/tensorflow/stream_executor/lib/status.h @@ -0,0 +1,23 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_ + +#include "tensorflow/core/public/status.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/platform/logging.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::Status; + +#define SE_CHECK_OK(val) \ + CHECK_EQ(::perftools::gputools::port::Status::OK(), (val)) +#define SE_ASSERT_OK(val) \ + ASSERT_EQ(::perftools::gputools::port::Status::OK(), (val)) + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_ diff --git a/tensorflow/stream_executor/lib/status_macros.h b/tensorflow/stream_executor/lib/status_macros.h new file mode 100644 index 0000000000..7e1de92a98 --- /dev/null +++ b/tensorflow/stream_executor/lib/status_macros.h @@ -0,0 +1,54 @@ +// Helper macros for dealing with the port::Status datatype. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_MACROS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_MACROS_H_ + +// Early-returns the status if it is in error; otherwise, proceeds. +// +// The argument expression is guaranteed to be evaluated exactly once. +#define SE_RETURN_IF_ERROR(__status) \ + do { \ + auto status = __status; \ + if (!status.ok()) { \ + return status; \ + } \ + } while (false) + +// Identifier concatenation helper macros. +#define SE_MACRO_CONCAT_INNER(__x, __y) __x##__y +#define SE_MACRO_CONCAT(__x, __y) SE_MACRO_CONCAT_INNER(__x, __y) + +// Implementation of SE_ASSIGN_OR_RETURN that uses a unique temporary identifier +// for avoiding collision in the enclosing scope. +#define SE_ASSIGN_OR_RETURN_IMPL(__lhs, __rhs, __name) \ + auto __name = (__rhs); \ + if (!__name.ok()) { \ + return __name.status(); \ + } \ + __lhs = __name.ConsumeValueOrDie(); + +// Early-returns the status if it is in error; otherwise, assigns the +// right-hand-side expression to the left-hand-side expression. +// +// The right-hand-side expression is guaranteed to be evaluated exactly once. +#define SE_ASSIGN_OR_RETURN(__lhs, __rhs) \ + SE_ASSIGN_OR_RETURN_IMPL(__lhs, __rhs, \ + SE_MACRO_CONCAT(__status_or_value, __COUNTER__)) + +// Logs the status and returns false if it is in error; otherwise, returns true. +// +// The argument expression is guaranteed to be evaluated exactly once. +// +// TODO(leary) remove as many of these as possible with port::Status +// proliferation. 
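+//
+// For example (sketch; DoWork() stands for any call returning port::Status):
+//
+//   bool DoWorkAsBool() {
+//     SE_RETURN_STATUS_AS_BOOL(DoWork());
+//   }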
+#define SE_RETURN_STATUS_AS_BOOL(__status) \
+  do {                                     \
+    auto status = __status;                \
+    if (status.ok()) {                     \
+      return true;                         \
+    }                                      \
+    LOG(ERROR) << status;                  \
+    return false;                          \
+  } while (false)
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_MACROS_H_
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
new file mode 100644
index 0000000000..38ce35e46e
--- /dev/null
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -0,0 +1,234 @@
+// Copyright 2008 Google Inc. All Rights Reserved.
+// Author: acm@google.com (Andrew Morrow)
+// Author: zhengxq@google.com (Xiaoqiang Zheng)
+//
+// StatusOr<T> is the union of a Status object and a T
+// object. StatusOr models the concept of an object that is either a
+// usable value, or an error Status explaining why such a value is
+// not present. To this end, StatusOr<T> does not allow its Status
+// value to be Status::OK. Further, StatusOr<T*> does not allow the
+// contained pointer to be NULL.
+//
+// The primary use-case for StatusOr<T> is as the return value of a
+// function which may fail.
+//
+// Example client usage for a StatusOr<T>, where T is not a pointer:
+//
+//  StatusOr<float> result = DoBigCalculationThatCouldFail();
+//  if (result.ok()) {
+//    float answer = result.ValueOrDie();
+//    printf("Big calculation yielded: %f", answer);
+//  } else {
+//    LOG(ERROR) << result.status();
+//  }
+//
+// Example client usage for a StatusOr<T*>:
+//
+//  StatusOr<Foo*> result = FooFactory::MakeNewFoo(arg);
+//  if (result.ok()) {
+//    std::unique_ptr<Foo> foo(result.ValueOrDie());
+//    foo->DoSomethingCool();
+//  } else {
+//    LOG(ERROR) << result.status();
+//  }
+//
+// Example client usage for a StatusOr<std::unique_ptr<T>>:
+//
+//  StatusOr<std::unique_ptr<Foo>> result = FooFactory::MakeNewFoo(arg);
+//  if (result.ok()) {
+//    std::unique_ptr<Foo> foo = result.ConsumeValueOrDie();
+//    foo->DoSomethingCool();
+//  } else {
+//    LOG(ERROR) << result.status();
+//  }
+//
+// Example factory implementation returning StatusOr<T*>:
+//
+//  StatusOr<Foo*> FooFactory::MakeNewFoo(int arg) {
+//    if (arg <= 0) {
+//      return Status(port::error::INVALID_ARGUMENT,
+//                    "Arg must be positive");
+//    } else {
+//      return new Foo(arg);
+//    }
+//  }
+//
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
+
+#include <new>
+#include "tensorflow/stream_executor/platform/port.h"
+#include <type_traits>
+#include <utility>
+
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+namespace port {
+
+template<typename T>
+class StatusOr {
+  template<typename U> friend class StatusOr;
+
+ public:
+  // Construct a new StatusOr with Status::UNKNOWN status
+  StatusOr() : status_(error::UNKNOWN, "") {}
+
+  // Construct a new StatusOr with the given non-ok status. After calling
+  // this constructor, calls to ValueOrDie() are invalid.
+  //
+  // NOTE: Not explicit - we want to use StatusOr<T> as a return
+  // value, so it is convenient and sensible to be able to do 'return
+  // Status()' when the return type is StatusOr<T>.
+  //
+  // REQUIRES: status != Status::OK.
+  // In optimized builds, passing Status::OK here will have the effect
+  // of falling back to an INTERNAL-error Status (see the constructor
+  // implementation below).
+  StatusOr(const Status& status);  // NOLINT
+
+  // Construct a new StatusOr with the given value.
If T is a plain pointer,
+  // value must not be NULL. After calling this constructor, calls to
+  // ValueOrDie() will succeed, and calls to status() will return OK.
+  //
+  // NOTE: Not explicit - we want to use StatusOr<T> as a return type
+  // so it is convenient and sensible to be able to do 'return T()'
+  // when the return type is StatusOr<T>.
+  //
+  // REQUIRES: if T is a plain pointer, value != NULL.
+  // In optimized builds, passing a NULL pointer here will have the
+  // effect of falling back to an INTERNAL-error Status (see
+  // CheckValueNotNull below).
+  StatusOr(const T& value);  // NOLINT
+
+  // Conversion copy constructor, T must be copy constructible from U
+  template <typename U>
+  StatusOr(const StatusOr<U>& other)  // NOLINT
+      : status_(other.status_),
+        value_(other.value_) {}
+
+  // Conversion assignment operator, T must be assignable from U
+  template <typename U>
+  StatusOr& operator=(const StatusOr<U>& other) {
+    status_ = other.status_;
+    value_ = other.value_;
+    return *this;
+  }
+
+  // Rvalue-reference overloads of the other constructors and assignment
+  // operators, to support move-only types and avoid unnecessary copying.
+  StatusOr(T&& value);  // NOLINT
+
+  // Move conversion operator to avoid unnecessary copy.
+  // T must be assignable from U.
+  // Not marked with explicit so the implicit conversion can happen.
+  template <typename U>
+  StatusOr(StatusOr<U>&& other)  // NOLINT
+      : status_(std::move(other.status_)),
+        value_(std::move(other.value_)) {}
+
+  // Move assignment operator to avoid unnecessary copy.
+  // T must be assignable from U
+  template <typename U>
+  StatusOr& operator=(StatusOr<U>&& other) {
+    status_ = std::move(other.status_);
+    value_ = std::move(other.value_);
+    return *this;
+  }
+
+  // Returns a reference to our status. If this contains a T, then
+  // returns Status::OK.
+  const Status& status() const { return status_; }
+
+  // Returns this->status().ok()
+  bool ok() const { return status_.ok(); }
+
+  // Returns a reference to our current value, requires that this->ok().
+  // If you need to initialize a T object from the stored value,
+  // ConsumeValueOrDie() may be more efficient.
+  const T& ValueOrDie() const;
+
+  // Returns our current value, requires this->ok(). Use this if
+  // you would otherwise want to say std::move(s.ValueOrDie()), for example
+  // if you need to initialize a T object from the stored value and you don't
+  // need subsequent access to the stored value. It uses T's move constructor,
+  // if it has one, so it will work with move-only types, and will often be
+  // more efficient than ValueOrDie, but may leave the stored value
+  // in an arbitrary valid state.
+  T ConsumeValueOrDie();
+
+ private:
+  Status status_;
+  T value_;
+
+  void CheckValueNotNull(const T& value);
+
+  template <typename U>
+  struct IsNull {
+    // For non-pointer U, a reference can never be NULL.
+ static inline bool IsValueNull(const U& t) { return false; } + }; + + template <typename U> + struct IsNull<U*> { + static inline bool IsValueNull(const U* t) { return t == NULL; } + }; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Implementation details for StatusOr<T> + +template <typename T> +StatusOr<T>::StatusOr(const T& value) + : status_(), value_(value) { + CheckValueNotNull(value); +} + +template <typename T> +const T& StatusOr<T>::ValueOrDie() const { + assert(status_.ok()); + return value_; +} + +template <typename T> +T StatusOr<T>::ConsumeValueOrDie() { + assert(status_.ok()); + return std::move(value_); +} + +template <typename T> +StatusOr<T>::StatusOr(const Status& status) + : status_(status) { + assert(!status.ok()); + if (status.ok()) { + status_ = + Status(error::INTERNAL, + "Status::OK is not a valid constructor argument to StatusOr<T>"); + } +} + +template <typename T> +StatusOr<T>::StatusOr(T&& value) + : status_() { + CheckValueNotNull(value); + value_ = std::move(value); +} + +template <typename T> +void StatusOr<T>::CheckValueNotNull(const T& value) { + assert(!IsNull<T>::IsValueNull(value)); + if (IsNull<T>::IsValueNull(value)) { + status_ = + Status(error::INTERNAL, + "NULL is not a valid constructor argument to StatusOr<T*>"); + } +} + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_ diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h new file mode 100644 index 0000000000..021f54dfec --- /dev/null +++ b/tensorflow/stream_executor/lib/str_util.h @@ -0,0 +1,30 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_ + +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/stream_executor/lib/stringpiece.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::str_util::Join; +using tensorflow::str_util::Split; + +// Returns a copy of the input string 'str' with the given 'suffix' +// removed. If the suffix doesn't match, returns a copy of the original string. 
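+// For example, StripSuffixString("main.cc", ".cc") returns "main", and
+// StripSuffixString("main.cc", ".h") returns "main.cc" unchanged.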
+inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix) { + if (str.ends_with(suffix)) { + str.remove_suffix(suffix.size()); + } + return str.ToString(); +} + +using tensorflow::str_util::Lowercase; +using tensorflow::str_util::Uppercase; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_ diff --git a/tensorflow/stream_executor/lib/strcat.h b/tensorflow/stream_executor/lib/strcat.h new file mode 100644 index 0000000000..b3fe4da327 --- /dev/null +++ b/tensorflow/stream_executor/lib/strcat.h @@ -0,0 +1,17 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_ + +#include "tensorflow/core/lib/strings/strcat.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::strings::StrCat; +using tensorflow::strings::StrAppend; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_ diff --git a/tensorflow/stream_executor/lib/stringpiece.h b/tensorflow/stream_executor/lib/stringpiece.h new file mode 100644 index 0000000000..14e6fc99d7 --- /dev/null +++ b/tensorflow/stream_executor/lib/stringpiece.h @@ -0,0 +1,17 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_ + +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::StringPiece; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_ diff --git a/tensorflow/stream_executor/lib/stringprintf.h b/tensorflow/stream_executor/lib/stringprintf.h new file mode 100644 index 0000000000..379e7e9a83 --- /dev/null +++ b/tensorflow/stream_executor/lib/stringprintf.h @@ -0,0 +1,18 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_ + +#include "tensorflow/core/lib/strings/stringprintf.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::strings::Printf; +using tensorflow::strings::Appendf; +using tensorflow::strings::Appendv; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_ diff --git a/tensorflow/stream_executor/lib/thread_options.h b/tensorflow/stream_executor/lib/thread_options.h new file mode 100644 index 0000000000..7d436578d6 --- /dev/null +++ b/tensorflow/stream_executor/lib/thread_options.h @@ -0,0 +1,16 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_THREAD_OPTIONS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_THREAD_OPTIONS_H_ + +#include "tensorflow/core/public/env.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::ThreadOptions; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_THREAD_OPTIONS_H_ diff --git a/tensorflow/stream_executor/lib/threadpool.h b/tensorflow/stream_executor/lib/threadpool.h new file mode 100644 index 0000000000..3cf297d57b --- /dev/null +++ b/tensorflow/stream_executor/lib/threadpool.h @@ -0,0 +1,19 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_THREADPOOL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_LIB_THREADPOOL_H_ + +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/stream_executor/lib/env.h" +#include 
"tensorflow/stream_executor/lib/notification.h" +#include "tensorflow/stream_executor/lib/thread_options.h" + +namespace perftools { +namespace gputools { +namespace port { + +using tensorflow::thread::ThreadPool; + +} // namespace port +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_LIB_THREADPOOL_H_ diff --git a/tensorflow/stream_executor/machine_manager.cc b/tensorflow/stream_executor/machine_manager.cc new file mode 100644 index 0000000000..6d7bc50379 --- /dev/null +++ b/tensorflow/stream_executor/machine_manager.cc @@ -0,0 +1,276 @@ +#include "tensorflow/stream_executor/machine_manager.h" + +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/dso_loader.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +mutex MachineManager::mu_{LINKER_INITIALIZED}; + +MachineManager *MachineManager::singleton_ = nullptr; + +PlatformKind MachineManager::DetectPreferredPlatform() { +// TODO(leary) for KNC card experiments, figure out a legitimate way to +// determine this. For now, we use a compile-time hint so we can compile tests +// for both. +#if defined TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_PREFER_OPENCL + return PlatformKind::kOpenCL; +#elif defined TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_PREFER_HOST + return PlatformKind::kHost; +#else + return PlatformKind::kCuda; +#endif +} + +/* static */ port::StatusOr<std::unique_ptr<MachineManager>> +MachineManager::Create(PlatformKind kind, DeviceOptions options, + const PluginConfig &config) { + std::unique_ptr<MachineManager> machine_manager{ + new MachineManager{kind, options, config}}; + auto init_status = machine_manager->Init(); + if (!init_status.ok()) { + return init_status; + } + + return std::move(machine_manager); +} + +MachineManager::MachineManager(PlatformKind platform, + DeviceOptions device_options, + const PluginConfig &config) + : platform_(platform), + device_options_(device_options), + plugin_config_(config), + min_numa_node_(0), + limit_numa_node_(0) {} + +port::Status MachineManager::Init() { + // Initialize the first StreamExecutor, then use that platform interface to + // grab the device count. 
+ executors_.resize(1); + executors_[0].reset(new StreamExecutor{platform_, plugin_config_}); + auto status = executors_[0]->Init(0 /* = device_ordinal */, device_options_); + if (!status.ok()) { + return port::Status{ + port::error::FAILED_PRECONDITION, + port::StrCat( + "failed to initialize StreamExecutor for device ordinal 0: ", + status.ToString())}; + } + int device_count = executors_[0]->PlatformDeviceCount(); + if (device_count == 0) { + LOG(WARNING) << "no devices found for platform " + << PlatformKindString(platform_); + min_numa_node_ = limit_numa_node_ = 0; + return port::Status::OK(); + } + + streams_.resize(device_count); + streams_[0].reset(new Stream(executors_[0].get())); + if (!streams_[0]->Init().ok()) { + return port::Status{ + port::error::FAILED_PRECONDITION, + "failed to initialize default stream for device ordinal 0"}; + } + + min_numa_node_ = executors_[0]->GetDeviceDescription().numa_node(); + limit_numa_node_ = min_numa_node_ + 1; + + executors_.resize(device_count); + for (int device_ordinal = 1; device_ordinal < device_count; + ++device_ordinal) { + StreamExecutor *stream_exec = new StreamExecutor{platform_, plugin_config_}; + executors_[device_ordinal].reset(stream_exec); + auto status = stream_exec->Init(device_ordinal, device_options_); + if (!status.ok()) { + return port::Status( + port::error::FAILED_PRECONDITION, + port::StrCat( + "failed to initialize StreamExecutor for device ordinal ", + device_ordinal, ": ", status.ToString())); + } + + min_numa_node_ = std::min(min_numa_node_, + stream_exec->GetDeviceDescription().numa_node()); + limit_numa_node_ = std::max( + limit_numa_node_, stream_exec->GetDeviceDescription().numa_node() + 1); + + if (!stream_exec->GetDeviceDescription().ecc_enabled()) { + LOG(WARNING) << "ECC not enabled for device ordinal: " << device_ordinal; + } + + streams_[device_ordinal].reset( + new Stream(executors_[device_ordinal].get())); + if (!streams_[device_ordinal]->Init().ok()) { + return port::Status( + port::error::FAILED_PRECONDITION, + port::StrCat( + "failed to initialize default stream for device ordinal ", + device_ordinal)); + } + } + + return port::Status::OK(); +} + +int MachineManager::device_count() const { return executors_.size(); } + +port::Status MachineManager::EnablePeerAccess() { + auto peer_access_map = GetPeerAccessMap(); + for (const auto &access : *peer_access_map) { + auto devices = access.first; + if (access.second) { + StreamExecutor *from = executors_[devices.first].get(); + StreamExecutor *to = executors_[devices.second].get(); + auto status = from->EnablePeerAccessTo(to); + if (!status.ok()) { + return status; + } + } else { + LOG(INFO) << "cannot enable peer access from device ordinal " + << devices.first << " to device ordinal " << devices.second; + } + } + return port::Status::OK(); +} + +std::unique_ptr<std::map<std::pair<int, int>, bool>> +MachineManager::GetPeerAccessMap() { + auto *map = new std::map<std::pair<int, int>, bool>; + for (int i = 0; i < device_count(); ++i) { + for (int j = 0; j < device_count(); ++j) { + StreamExecutor *from = executors_[i].get(); + StreamExecutor *to = executors_[j].get(); + (*map)[{i, j}] = from->CanEnablePeerAccessTo(to); + } + } + + return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map}; +} + +StreamExecutor *MachineManager::executor_for_device(int device_ordinal) const { + CHECK_GE(device_ordinal, 0) << "device ordinal must be non-negative"; + CHECK(0 <= device_ordinal && device_ordinal < device_count()) + << "device " << device_ordinal << " out of 
range with device count " + << device_count(); + StreamExecutor *executor = executors_[device_ordinal].get(); + CHECK(executor != nullptr); + return executor; +} + +int MachineManager::ExecutorToBus(const StreamExecutor *stream_exec) const { + return stream_exec->GetDeviceDescription().numa_node() - min_numa_node_; +} + +int MachineManager::DeviceToBus(int device_ordinal) const { + return ExecutorToBus(executor_for_device(device_ordinal)); +} + +int MachineManager::ExecutorToNumaNode( + const StreamExecutor *stream_exec) const { + return stream_exec->GetDeviceDescription().numa_node(); +} + +int MachineManager::DeviceToNumaNode(int device_ordinal) const { + return ExecutorToNumaNode(executor_for_device(device_ordinal)); +} + +StreamExecutor *MachineManager::first_executor_for_bus(int bus_ordinal) { + CHECK_LT(bus_ordinal, bus_count()) << "bus ordinal out of available range"; + for (auto &executor : executors_) { + if (ExecutorToBus(executor.get()) == bus_ordinal) { + return executor.get(); + } + } + + LOG(WARNING) << "could not find executor requested for bus ordinal: " + << bus_ordinal; + return nullptr; +} + +StreamExecutor *MachineManager::first_executor_for_numa_node(int numa_node) { + for (auto &executor : executors_) { + if (ExecutorToNumaNode(executor.get()) == numa_node) { + return executor.get(); + } + } + + LOG(WARNING) << "could not find executor requested for numa_node: " + << numa_node; + return nullptr; +} + +Stream *MachineManager::stream_for_device(int device_ordinal) { + CHECK(0 <= device_ordinal && device_ordinal < device_count()); + Stream *stream = streams_[device_ordinal].get(); + CHECK(stream != nullptr); + return stream; +} + +/* static */ port::StatusOr<MachineManager *> +MachineManager::CreateSingletonInternal(PlatformKind platform, + DeviceOptions options, + const PluginConfig &config) { + if (singleton_ != nullptr) { + return port::Status{ + port::error::ALREADY_EXISTS, + "cannot create machine manager singleton; one already exists"}; + } + + auto create_status = Create(platform, options, config); + if (!create_status.ok()) { + return create_status.status(); + } + + singleton_ = create_status.ConsumeValueOrDie().release(); + + VLOG(1) << "machine manager singleton is " << singleton_ << " with platform " + << PlatformKindString(platform) << " and device options " + << options.ToString(); + + return singleton_; +} + +/* static */ MachineManager *MachineManager::CreateSingletonOrDie( + PlatformKind platform, DeviceOptions options, const PluginConfig &config) { + auto status = CreateSingleton(platform, options, config); + if (!status.ok()) { + LOG(FATAL) << "failed to create MachineManager singleton: " + << status.status(); + } + return status.ValueOrDie(); +} + +/* static */ port::StatusOr<MachineManager *> MachineManager::CreateSingleton( + PlatformKind platform, DeviceOptions device_options, + const PluginConfig &config) { + mutex_lock lock{mu_}; + return CreateSingletonInternal(platform, device_options, config); +} + +/* static */ MachineManager *MachineManager::singleton() { + mutex_lock lock{mu_}; + if (singleton_ == nullptr) { + PlatformKind platform = DetectPreferredPlatform(); + DeviceOptions options = DeviceOptions::Default(); + auto status = CreateSingletonInternal(platform, options, PluginConfig()); + if (!status.ok()) { + LOG(FATAL) + << "failed to create MachineManager singleton: " + "singleton accessor attempted lazy construction but failed: " + << status.status(); + } + return status.ValueOrDie(); + } + + return singleton_; +} + +} // namespace 
gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/machine_manager.h b/tensorflow/stream_executor/machine_manager.h
new file mode 100644
index 0000000000..bcff7a9da0
--- /dev/null
+++ b/tensorflow/stream_executor/machine_manager.h
@@ -0,0 +1,197 @@
+// This interface provides a machine-wide resource management singleton
+// interface as a convenience for users who will want to exploit all of the GPU
+// resources present on the system.
+//
+// To use the singleton interface:
+//
+//  // At start of program or in your module initializer.
+//  // Do not call this with different sets of arguments!
+//  MachineManager::CreateSingletonOrDie(
+//      MachineManager::DetectPreferredPlatform(), DeviceOptions::Default());
+//
+//  // At any point after that, this convenience interface avoids you having to
+//  // pass those two parameters:
+//  StreamExecutor *device0_executor =
+//      MachineManager::singleton()->executor_for_device(0 /* = ordinal */);
+//  ...

+// ----------------- THIS CLASS IS DEPRECATED - DO NOT USE ------------------
+// This class is not suitable for open-sourcing, as it does not support
+// plugins and depends on hardcoded PlatformKind enums. MultiPlatformManager and
+// Platform plugins are the replacements.
+// ----------------- THIS CLASS IS DEPRECATED - DO NOT USE ------------------
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_H_
+
+#include <map>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/stream_executor/device_options.h"  // IWYU pragma: export
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace perftools {
+namespace gputools {
+
+// MachineManager is used to instantiate and manage singleton resources for
+// all the GPUs present on a machine. This basically amounts to having a
+// StreamExecutor-per-device pool.
+//
+// Thread-safe.
+class MachineManager {
+ public:
+  // Inspects the host to determine the preferred GPU execution platform.
+  // To force OpenCL from a build target on a machine that has both OpenCL and
+  // CUDA capabilities, link against the :stream_executor_prefer_opencl target.
+  static PlatformKind DetectPreferredPlatform();
+
+  // Returns the machine manager singleton.
+  // If the singleton has not yet been created when this is invoked, this
+  // creates it with reasonable default options; otherwise it returns the
+  // already-created singleton. If there are errors during creation, this call
+  // will terminate the program.
+  static MachineManager *singleton();
+
+  // Returns a singleton instance of the machine manager -- it's generally
+  // assumed that users will have one of these for a real-world application as
+  // a form of resource manager.
+  //
+  // This should only be called once, at the initialization of an application,
+  // if at all -- MachineManager::singleton() will return a value with sensible
+  // defaults as determined by DetectPreferredPlatform. Attempts to create the
+  // singleton with options multiple times will result in an error.
+  static port::StatusOr<MachineManager *> CreateSingleton(
+      PlatformKind platform, DeviceOptions device_options,
+      const PluginConfig &config = PluginConfig());
+
+  // Convenience "or die" wrapper around the above call.
+ static MachineManager *CreateSingletonOrDie( + PlatformKind platform, DeviceOptions device_options, + const PluginConfig &config = PluginConfig()); + + // Creates a new instantiation of the MachineManager. + // Warning: generally users will want to use the singleton form, see + // MachineManager::singleton(). + // + // The machine manager detects a fixed set of devices on creation, and that + // set does not change over the course of its lifetime. This does not support + // things like hot-plugging of GPUs or GPUs dropping off the bus in a + // recoverable manner. + static port::StatusOr<std::unique_ptr<MachineManager>> Create( + PlatformKind kind, DeviceOptions options, + const PluginConfig &config = PluginConfig()); + + // Returns the number of devices visible to the machine manager. + int device_count() const; + + // Returns the StreamExecutor for one of the machine-manager visible devices. + // Checks that device_ordinal is within device_count() bounds. + StreamExecutor *executor_for_device(int device_ordinal) const; + + // Returns the bus ordinal count (as determined by the span of NUMA nodes + // associated with the available devices). + int bus_count() const { return limit_numa_node_ - min_numa_node_; } + + // Returns the bus ordinal associated with a given device ordinal. + int DeviceToBus(int device_ordinal) const; + + // Returns the NUMA node associated with a given device ordinal. + int DeviceToNumaNode(int device_ordinal) const; + + // Returns the first StreamExecutor (within device_count() ordinals) that has + // the corresponding bus ordinal, or nullptr if none is found. + // + // The valid bus ordinals can be enumerated by scanning through the executors + // and seeing what bus number they are on. + StreamExecutor *first_executor_for_bus(int bus_ordinal); + + // Returns the first StreamExecutor associated with the specified + // numa_node, or nullptr if none is found. + StreamExecutor *first_executor_for_numa_node(int numa_node); + + // Returns the default stream for the default executor (that returned by + // executor_for_device()). The same stream will be returned for all calls to + // stream_for_device() (with the same device_ordinal). + Stream *stream_for_device(int device_ordinal); + + // Returns the platform that this machine manager was created to target. + PlatformKind platform() const { return platform_; } + + // Enables peer access between all possible devices on this platform. + // Only dies due to failure to enable peer access for device pairs for which + // GetPeerAccessMap() is true. + port::Status EnablePeerAccess(); + + // Returns a map that says, for pairs (device ordinal i, device ordinal j), + // whether i can access j's memory space. + std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(); + + private: + // Guts of the singleton creation mechanism that requires the exclusive + // singleton lock to be held, in order to prevent deadlock due to method + // composition. + static port::StatusOr<MachineManager *> CreateSingletonInternal( + PlatformKind platform, DeviceOptions options, const PluginConfig &config) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Private constructor used in singleton creation. + MachineManager(PlatformKind platform, DeviceOptions options, + const PluginConfig &config); + + // Populates the executors_ vector with an executor per observable device + // ordinal on the platform. Logs and returns an error status if any of the + // StreamExecutors cannot be created.
+ port::Status Init(); + + // Converts a StreamExecutor's NUMA node association into a bus ordinal for + // this machine. + int ExecutorToBus(const StreamExecutor *stream_exec) const; + + // Returns the NUMA node association for the StreamExecutor. + int ExecutorToNumaNode(const StreamExecutor *stream_exec) const; + + // Mutex that guards the initialization of the machine manager static + // variable. + static mutex mu_; + + // Singleton MachineManager value -- assignment to this is protected by a + // static singleton guard clause. + static MachineManager *singleton_ GUARDED_BY(mu_); + + // Holds an executor associated with each device ordinal present in the + // system; the device ordinals are the indices. Immutable after + // initialization. + std::vector<std::unique_ptr<StreamExecutor>> executors_; + + // Holds a stream associated with each device ordinal present in the + // system; the device ordinals are the indices. Immutable after + // initialization. + std::vector<std::unique_ptr<Stream>> streams_; + + // The platform that this is managing for the machine. + PlatformKind platform_; + + // Options used to create StreamExecutors on each of the respective devices. + DeviceOptions device_options_; + + // Plugin configuration to use for all StreamExecutors created by this object. + PluginConfig plugin_config_; + + // The smallest NUMA node value for any device managed by this machine + // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus + // ordinals. The NUMA node space occupied by GPUs is assumed to be dense. + int min_numa_node_; + + // Larger than the NUMA node value for any device managed by this machine + // manager. + int limit_numa_node_; +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_H_ diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc new file mode 100644 index 0000000000..a65add05c5 --- /dev/null +++ b/tensorflow/stream_executor/multi_platform_manager.cc @@ -0,0 +1,66 @@ +#include "tensorflow/stream_executor/multi_platform_manager.h" + +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/str_util.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" + +namespace perftools { +namespace gputools { + +/* static */ mutex MultiPlatformManager::platforms_mutex_(LINKER_INITIALIZED); + +/* static */ port::Status MultiPlatformManager::RegisterPlatform( + std::unique_ptr<Platform> platform) { + CHECK(platform != nullptr); + string key = port::Lowercase(platform->Name()); + mutex_lock lock(platforms_mutex_); + if (GetPlatformMap()->find(key) != GetPlatformMap()->end()) { + return port::Status(port::error::INTERNAL, + "platform is already registered with name: \"" + + platform->Name() + "\""); + } + GetPlatformByIdMap()->insert(std::make_pair(platform->id(), platform.get())); + // Release ownership/uniqueness to prevent destruction on program exit. + // This avoids Platforms "cleaning up" on program exit, because otherwise, + // there are _very_ tricky races between StreamExecutor and underlying + // platforms (CUDA, OpenCL) during exit. Since these are fixed-size and 1x per + // program, these are deemed acceptable.
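+ // (The release() call below, rather than moving the unique_ptr into the + // map, is what implements this intentional leak.)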
+ (*GetPlatformMap())[key] = platform.release(); + return port::Status::OK(); +} + +/* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName( + const string& target) { + mutex_lock lock(platforms_mutex_); + auto it = GetPlatformMap()->find(port::Lowercase(target)); + + if (it == GetPlatformMap()->end()) { + return port::Status( + port::error::NOT_FOUND, + "could not find registered platform with name: \"" + target + "\""); + } + + return it->second; +} + +/* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId( + const Platform::Id& id) { + mutex_lock lock(platforms_mutex_); + auto it = GetPlatformByIdMap()->find(id); + if (it == GetPlatformByIdMap()->end()) { + return port::Status( + port::error::NOT_FOUND, + port::Printf("could not find registered platform with id: 0x%p", id)); + } + + return it->second; +} + +/* static */ void MultiPlatformManager::ClearPlatformRegistry() { + mutex_lock lock(platforms_mutex_); + GetPlatformMap()->clear(); + GetPlatformByIdMap()->clear(); +} + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h new file mode 100644 index 0000000000..ade7fac24b --- /dev/null +++ b/tensorflow/stream_executor/multi_platform_manager.h @@ -0,0 +1,144 @@ +// This is a registration-oriented interface for multiple platforms. It will +// replace the MachineManager singleton interface, as MachineManager does not +// currently support simultaneous use of multiple platforms. +// +// Usage: +// +// In your BUILD rule, add a dependency on a platform plugin that you'd like +// to use, such as: +// +// //perftools/gputools/executor/cuda:cuda_platform +// //perftools/gputools/executor/opencl:opencl_platform +// +// This will register platform plugins that can be discovered via this +// interface. Sample API usage: +// +// port::StatusOr<Platform*> platform_status = +// gpu::MultiPlatformManager::PlatformWithName("OpenCL"); +// if (!platform_status.ok()) { ... } +// Platform* platform = platform_status.ValueOrDie(); +// LOG(INFO) << platform->VisibleDeviceCount() << " devices visible"; +// if (platform->VisibleDeviceCount() <= 0) { return; } +// +// for (int i = 0; i < platform->VisibleDeviceCount(); ++i) { +// port::StatusOr<StreamExecutor*> executor_status = +// platform->ExecutorForDevice(i); +// if (!executor_status.ok()) { +// LOG(INFO) << "could not retrieve executor for device ordinal " << i +// << ": " << executor_status.status(); +// continue; +// } +// LOG(INFO) << "found usable executor: " << executor_status.ValueOrDie(); +// } +// +// A few things to note: +// - There is no standard formatting/practice for identifying the name of a +// platform. Ideally, a platform will list its registered name in its header +// or in other associated documentation. +// - Platform name lookup is case-insensitive. "OpenCL" or "opencl" (or even +// "OpEnCl") would work correctly in the above example. +// +// And similarly, for standard interfaces (BLAS, RNG, etc.)
you can add +// dependencies on support libraries, e.g.: +// +// //perftools/gputools/executor/cuda:pluton_blas_plugin +// //perftools/gputools/executor/cuda:cudnn_plugin +// //perftools/gputools/executor/cuda:cublas_plugin +// //perftools/gputools/executor/cuda:curand_plugin + +#ifndef TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_ + +#include <functional> +#include <map> +#include <memory> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +// Manages multiple platforms that may be present on the current machine. +class MultiPlatformManager { + public: + // Registers a platform object; returns an error status if a platform with + // the same name has already been registered. + static port::Status RegisterPlatform(std::unique_ptr<Platform> platform); + + // Retrieves the platform registered with the given platform name; e.g. + // "CUDA", "OpenCL", ... + // + // If the requested platform is not registered, an error status is returned. + // Ownership of the platform is NOT transferred to the caller -- + // the MultiPlatformManager owns the platforms in a singleton-like fashion. + static port::StatusOr<Platform*> PlatformWithName(const string& target); + + // Retrieves the platform registered with the given platform ID, which + // is an opaque (but comparable) value. + // + // If the requested platform is not registered, an error status is returned. + // Ownership of the platform is NOT transferred to the caller -- + // the MultiPlatformManager owns the platforms in a singleton-like fashion. + static port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id); + + // Clears the set of registered platforms, primarily used for testing. + static void ClearPlatformRegistry(); + + // Although the MultiPlatformManager "owns" its platforms, it holds them as + // undecorated pointers to prevent races during program exit (between this + // object's data and the underlying platforms, e.g., CUDA and OpenCL). + // Because certain platforms have unpredictable deinitialization + // times/sequences, it is not possible to structure a safe deinitialization + // sequence. Thus, we intentionally "leak" allocated platforms to defer + // cleanup to the OS. This should be acceptable, as these are one-time + // allocations per program invocation. + // The MultiPlatformManager should be considered the owner + // of any platforms registered with it, and leak checking should be disabled + // during allocation of such Platforms, to avoid spurious reporting at program + // exit. + using PlatformMap = std::map<string, Platform*>; + + // Provides access to the available set of platforms under a lock. + static port::Status WithPlatforms( + std::function<port::Status(PlatformMap*)> callback) { + mutex_lock lock(platforms_mutex_); + return callback(GetPlatformMap()); + } + + private: + // Mutex that guards the platform map. + static mutex platforms_mutex_; + + // TODO(b/22689637): Clean up these two maps; make sure they coexist nicely.
+ // TODO(b/22689637): Move this (whatever the final/"official" map is) to + // plugin_registry.h, along with the associated functionality. + // Platform-name-to-object mapping. These platforms are registered via module + // initializers, and linkage determines which platforms are available to a + // given target. + static PlatformMap* GetPlatformMap() { + static PlatformMap* instance = new PlatformMap; + return instance; + } + + // Holds a Platform::Id-to-object mapping. + // Unlike platforms_ above, this map does not own its contents. + static std::map<Platform::Id, Platform*>* GetPlatformByIdMap() { + using PlatformIdMap = std::map<Platform::Id, Platform*>; + static PlatformIdMap* instance = new PlatformIdMap; + return instance; + } + + SE_DISALLOW_COPY_AND_ASSIGN(MultiPlatformManager); +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_ diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc new file mode 100644 index 0000000000..8be9353bbe --- /dev/null +++ b/tensorflow/stream_executor/platform.cc @@ -0,0 +1,115 @@ +#include "tensorflow/stream_executor/platform.h" + +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" + +namespace perftools { +namespace gputools { + +string PlatformKindString(PlatformKind kind) { + switch (kind) { + case PlatformKind::kCuda: + return "CUDA"; + case PlatformKind::kOpenCL: + return "OpenCL"; + case PlatformKind::kOpenCLAltera: + return "OpenCL+Altera"; + case PlatformKind::kHost: + return "Host"; + case PlatformKind::kMock: + return "Mock"; + default: + return port::StrCat("InvalidPlatformKind(", static_cast<int>(kind), ")"); + } +} + +PlatformKind PlatformKindFromString(string kind) { + for (int i = 0; i < static_cast<int>(PlatformKind::kSize); ++i) { + if (kind == PlatformKindString(static_cast<PlatformKind>(i))) { + return static_cast<PlatformKind>(i); + } + } + + return PlatformKind::kInvalid; +} + +bool PlatformIsRunnable(PlatformKind kind) { + switch (kind) { + case PlatformKind::kCuda: + case PlatformKind::kOpenCL: + case PlatformKind::kHost: + return true; + default: + return false; + } +} + +bool PlatformIsRunnableOnDevice(PlatformKind kind) { + switch (kind) { + case PlatformKind::kCuda: + case PlatformKind::kOpenCL: + return true; + default: + return false; + } +} + +void CheckPlatformKindIsValid(PlatformKind kind) { + CHECK(static_cast<int>(PlatformKind::kCuda) <= static_cast<int>(kind) && + static_cast<int>(kind) <= static_cast<int>(PlatformKind::kMock)) + << "invalid GPU executor kind: " << PlatformKindString(kind); +} + +StreamExecutorConfig::StreamExecutorConfig() + : ordinal(-1), device_options(DeviceOptions::Default()) {} + +StreamExecutorConfig::StreamExecutorConfig(int ordinal_in) + : ordinal(ordinal_in), device_options(DeviceOptions::Default()) {} + +Platform::~Platform() {} + +port::Status Platform::ForceExecutorShutdown() { + return port::Status(port::error::UNIMPLEMENTED, + "executor shutdown is not supported on this platform"); +} + +std::unique_ptr<Platform::PeerAccessMap> Platform::GetPeerAccessMap() { + auto *map = new PeerAccessMap; + + int device_count = VisibleDeviceCount(); + for (int i = 0; i < device_count; ++i) { + for (int j = 0; j < device_count; ++j) { + StreamExecutor *from =
ExecutorForDevice(i).ValueOrDie(); + StreamExecutor *to = ExecutorForDevice(j).ValueOrDie(); + (*map)[{i, j}] = from->CanEnablePeerAccessTo(to); + } + } + + return std::unique_ptr<Platform::PeerAccessMap>{map}; +} + +port::Status Platform::EnablePeerAccess() { + auto peer_access_map = GetPeerAccessMap(); + for (const auto &access : *peer_access_map) { + auto devices = access.first; + if (access.second) { + StreamExecutor *from = ExecutorForDevice(devices.first).ValueOrDie(); + StreamExecutor *to = ExecutorForDevice(devices.second).ValueOrDie(); + auto status = from->EnablePeerAccessTo(to); + if (!status.ok()) { + return status; + } + } else { + LOG(INFO) << "cannot enable peer access from device ordinal " + << devices.first << " to device ordinal " << devices.second; + } + } + return port::Status::OK(); +} + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h new file mode 100644 index 0000000000..c8b500b424 --- /dev/null +++ b/tensorflow/stream_executor/platform.h @@ -0,0 +1,185 @@ +// Defines types and declares functions for identifying and extracting +// information about the types of platforms and supporting libraries for which +// StreamExecutor implementations exist. +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_H_ + +#include <map> +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/device_options.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/plugin.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { + +class StreamExecutor; + +// Describes the platform for a StreamExecutor instantiation to act upon. +// +// Implementors: if you add a value here be sure to update PlatformKindString +// and CheckPlatformKindIsValid. +enum class PlatformKind { + kInvalid, + kCuda, + kOpenCL, + kOpenCLAltera, // Altera FPGA OpenCL platform. + // See documentation: go/fpgaopencl + // (StreamExecutor integration) + kHost, + kMock, + kSize, +}; + +// Returns true if kind represents a valid platform capable of enqueuing items +// on a stream, but not necessarily on an accelerator device. +// Returns false for kMock and any invalid PlatformKind values. +bool PlatformIsRunnable(PlatformKind kind); + +// Returns true if kind represents a valid platform capable of running kernels +// on an accelerator device. Returns false for kHost*, kMock and any invalid +// PlatformKind values. +bool PlatformIsRunnableOnDevice(PlatformKind kind); + +// Returns a printable description of a PlatformKind. +string PlatformKindString(PlatformKind kind); + +// Returns the PlatformKind corresponding to the input string; returns kInvalid +// in the case of no match. +PlatformKind PlatformKindFromString(string platform_string); + +// Checks that kind takes on a valid value. +void CheckPlatformKindIsValid(PlatformKind kind); + +// StreamExecutorConfig encapsulates the set of options for constructing a +// StreamExecutor for a given platform. +struct StreamExecutorConfig { + // Sets members to defaults: -1 for ordinal (must be changed), and default + // PluginConfig and DeviceOptions. + StreamExecutorConfig(); + + // Simple ordinal-setting constructor. 
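+ // For example (an illustrative sketch; kMyBlasPlugin is a hypothetical + // PluginId): + // + // StreamExecutorConfig config{/*ordinal=*/0}; + // config.plugin_config = PluginConfig().SetBlas(kMyBlasPlugin); + // config.device_options = DeviceOptions::Default();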
+ explicit StreamExecutorConfig(int ordinal); + + // The ordinal of the device to be managed by the returned StreamExecutor. + int ordinal; + + // The PluginConfig for the returned StreamExecutor. + PluginConfig plugin_config; + + // The DeviceOptions for the returned StreamExecutor. + DeviceOptions device_options; +}; + +// Abstract base class for a platform registered with the MultiPlatformManager. +class Platform { + public: + virtual ~Platform(); + + // A platform ID is a unique identifier for each registered platform type - + // each platform is required to expose an ID to ensure unique registration and + // as a target against which plugins can register. + // + // The macro below is provided to help generate a [process-unique] identifier. + using Id = void*; + +// Helper macro to define a platform ID. To be used only inside platform +// implementation files. Works by "reserving" an address/value (guaranteed to be +// unique) inside a process space. +#define PLATFORM_DEFINE_ID(ID_VAR_NAME) \ + namespace { \ + int plugin_id_value; \ + } \ + const perftools::gputools::Platform::Id ID_VAR_NAME = &plugin_id_value; + + // Returns a key uniquely identifying this platform. + virtual Id id() const = 0; + + // Returns the number of devices accessible on this platform. + // + // Note that, though these devices are visible, if there is only one userspace + // context allowed for the device at a time and another process is using this + // device, a call to ExecutorForDevice may return an error status. + virtual int VisibleDeviceCount() const = 0; + + // Name of this platform. + virtual const string& Name() const = 0; + + // Returns a device with the given ordinal on this platform with a default + // plugin configuration; if none can be found with the given ordinal or + // there is an error in opening a context to communicate with the device, an + // error status is returned. + // + // Ownership of the executor is NOT transferred to the caller -- + // the Platform owns the executors in a singleton-like fashion. + virtual port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) = 0; + + // Returns a device or error, as above, with the specified plugins. + // + // Ownership of the executor is NOT transferred to the caller. + virtual port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& plugin_config) = 0; + + // Returns a device constructed with the options specified in "config". + // Ownership of the executor is NOT transferred to the caller. + virtual port::StatusOr<StreamExecutor*> GetExecutor( + const StreamExecutorConfig& config) = 0; + + // Returns a device constructed with the options specified in "config" without + // looking in or storing to the Platform's executor cache. + // Ownership IS transferred to the caller. + virtual port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor( + const StreamExecutorConfig& config) = 0; + + // Warning: this is a dangerous API and should be used with caution. + // + // Forces the platform to delete executor instances, releasing their + // associated device contexts. There must be no held instances of the executor + // and there must be no outstanding activity on the devices for this platform. + // + // This is only useful on platforms which bind a device to a single process + // that has obtained the device context. May return UNIMPLEMENTED on platforms + // that have no reason to destroy device contexts.
+ virtual port::Status ForceExecutorShutdown(); + + // Registers a TraceListener to listen to all StreamExecutors for this + // platform. + // Takes ownership of listener. + virtual void RegisterTraceListener( + std::unique_ptr<TraceListener> listener) = 0; + + // Removes the specified TraceListener from all StreamExecutors. + virtual void UnregisterTraceListener(TraceListener* listener) = 0; + + // Map of executor-to-executor coordinate and boolean, indicating if the first + // executor can access the second's memory. + using PeerAccessMap = std::map<std::pair<int, int>, bool>; + + // Returns a matrix indicating which executors can access which other + // executors' memory. + virtual std::unique_ptr<PeerAccessMap> GetPeerAccessMap(); + + // Attempts to enable all peer-to-peer access links described by the result of + // GetPeerAccessMap(). Note that calling this routine will force the creation + // of a default-argument (see StreamExecutorConfig) StreamExecutor object for + // each device ordinal in the system, should any not yet exist. + virtual port::Status EnablePeerAccess(); + + protected: + // SE_DISALLOW_COPY_AND_ASSIGN declares a constructor, which suppresses the + // presence of the default constructor. This statement re-enables it, which + // simplifies subclassing. + Platform() = default; + + private: + SE_DISALLOW_COPY_AND_ASSIGN(Platform); +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_H_ diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h new file mode 100644 index 0000000000..371eb7f156 --- /dev/null +++ b/tensorflow/stream_executor/platform/default/mutex.h @@ -0,0 +1,60 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_ + +#include <chrono> // NOLINT +#include <condition_variable> // NOLINT + +#include "tensorflow/stream_executor/platform/port.h" + +// std::shared_timed_mutex is a C++14 feature. +#if (__cplusplus >= 201402L) +#define STREAM_EXECUTOR_USE_SHARED_MUTEX +#endif // __cplusplus >= 201402L + +#ifdef STREAM_EXECUTOR_USE_SHARED_MUTEX +#include <shared_mutex> // NOLINT +#else +#include <mutex> // NOLINT +#endif + +namespace perftools { +namespace gputools { + +enum ConditionResult { kCond_Timeout, kCond_MaybeNotified }; + +#ifdef STREAM_EXECUTOR_USE_SHARED_MUTEX +typedef std::shared_timed_mutex BaseMutex; +#else +typedef std::mutex BaseMutex; +#endif + +// A class that wraps around the std::mutex implementation, only adding an +// additional LinkerInitialized constructor interface. +class mutex : public BaseMutex { + public: + mutex() {} + // The default implementation of std::mutex is safe to use after the linker + // initializations + explicit mutex(LinkerInitialized x) {} +}; + +typedef std::unique_lock<BaseMutex> mutex_lock; + +#ifdef STREAM_EXECUTOR_USE_SHARED_MUTEX +typedef std::shared_lock<BaseMutex> shared_lock; +#else +typedef std::unique_lock<BaseMutex> shared_lock; +#endif + +using std::condition_variable; + +inline ConditionResult WaitForMilliseconds(mutex_lock* mu, + condition_variable* cv, int64 ms) { + std::cv_status s = cv->wait_for(*mu, std::chrono::milliseconds(ms)); + return (s == std::cv_status::timeout) ? 
kCond_Timeout : kCond_MaybeNotified; +} + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_ diff --git a/tensorflow/stream_executor/platform/logging.h b/tensorflow/stream_executor/platform/logging.h new file mode 100644 index 0000000000..a3e2385dd3 --- /dev/null +++ b/tensorflow/stream_executor/platform/logging.h @@ -0,0 +1,21 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_LOGGING_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_LOGGING_H_ + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" + +#if !defined(PLATFORM_GOOGLE) + +// A CHECK() macro that lets you assert the success of a function that +// returns -1 and sets errno in case of an error. E.g. +// +// CHECK_ERR(mkdir(path, 0700)); +// +// or +// +// int fd = open(filename, flags); CHECK_ERR(fd) << ": open " << filename; +#define CHECK_ERR(invocation) CHECK((invocation) != -1) << #invocation + +#endif + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_LOGGING_H_ diff --git a/tensorflow/stream_executor/platform/mutex.h b/tensorflow/stream_executor/platform/mutex.h new file mode 100644 index 0000000000..21b1894737 --- /dev/null +++ b/tensorflow/stream_executor/platform/mutex.h @@ -0,0 +1,12 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_ + +#include "tensorflow/core/platform/port.h" + +#if defined(PLATFORM_GOOGLE) +#include "tensorflow/stream_executor/platform/google/mutex.h" +#else +#include "tensorflow/stream_executor/platform/default/mutex.h" +#endif + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_ diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h new file mode 100644 index 0000000000..ebe0cf517b --- /dev/null +++ b/tensorflow/stream_executor/platform/port.h @@ -0,0 +1,40 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_ + +#include "tensorflow/core/platform/port.h" + +namespace perftools { +namespace gputools { + +using tensorflow::int8; +using tensorflow::int16; +using tensorflow::int32; +using tensorflow::int64; + +using tensorflow::uint8; +using tensorflow::uint16; +using tensorflow::uint32; +using tensorflow::uint64; + +#if !defined(PLATFORM_GOOGLE) +using std::string; +#endif + +#if !defined(COMPILER_MSVC) +#define ARRAYSIZE(a) \ + ((sizeof(a) / sizeof(*(a))) / \ + static_cast<size_t>(!(sizeof(a) % sizeof(*(a))))) +#endif + +using tensorflow::LinkerInitialized; +using tensorflow::LINKER_INITIALIZED; + +#define SE_FALLTHROUGH_INTENDED TF_FALLTHROUGH_INTENDED + +} // namespace gputools +} // namespace perftools + +#define SE_DISALLOW_COPY_AND_ASSIGN TF_DISALLOW_COPY_AND_ASSIGN +#define SE_MUST_USE_RESULT TF_MUST_USE_RESULT + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_ diff --git a/tensorflow/stream_executor/platform/thread_annotations.h b/tensorflow/stream_executor/platform/thread_annotations.h new file mode 100644 index 0000000000..bce4bb3794 --- /dev/null +++ b/tensorflow/stream_executor/platform/thread_annotations.h @@ -0,0 +1,6 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_THREAD_ANNOTATIONS_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_THREAD_ANNOTATIONS_H_ + +#include "tensorflow/core/platform/thread_annotations.h" + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_THREAD_ANNOTATIONS_H_ diff --git a/tensorflow/stream_executor/plugin.cc b/tensorflow/stream_executor/plugin.cc new file mode 100644 index 
0000000000..8ca8ecff38 --- /dev/null +++ b/tensorflow/stream_executor/plugin.cc @@ -0,0 +1,40 @@ +#include "tensorflow/stream_executor/plugin.h" + +namespace perftools { +namespace gputools { + +// Mostly-arbitrary ID only used as a sentinel "not otherwise initialized" +// value. This value should never [need to] be specified except by +// initialization functions defined in this file and in PluginRegistry. +PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(PluginConfig::kDefault); + +PluginConfig::PluginConfig() + : blas_(kDefault), dnn_(kDefault), fft_(kDefault), rng_(kDefault) {} + +bool PluginConfig::operator==(const PluginConfig& rhs) const { + return blas_ == rhs.blas_ && dnn_ == rhs.dnn_ && fft_ == rhs.fft_ && + rng_ == rhs.rng_; +} + +PluginConfig& PluginConfig::SetBlas(PluginId blas) { + blas_ = blas; + return *this; +} + +PluginConfig& PluginConfig::SetDnn(PluginId dnn) { + dnn_ = dnn; + return *this; +} + +PluginConfig& PluginConfig::SetFft(PluginId fft) { + fft_ = fft; + return *this; +} + +PluginConfig& PluginConfig::SetRng(PluginId rng) { + rng_ = rng; + return *this; +} + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/plugin.h b/tensorflow/stream_executor/plugin.h new file mode 100644 index 0000000000..5dc39b7928 --- /dev/null +++ b/tensorflow/stream_executor/plugin.h @@ -0,0 +1,74 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_ + +namespace perftools { +namespace gputools { + +// A plugin ID is a unique identifier for each registered plugin type. +typedef void* PluginId; + +// Helper macro to define a plugin ID. To be used only inside plugin +// implementation files. Works by "reserving" an address/value (guaranteed to be +// unique) inside a process space. +#define PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(ID_VAR_NAME) \ + namespace { \ + int plugin_id_value; \ + } \ + const PluginId ID_VAR_NAME = &plugin_id_value; + +// kNullPlugin denotes an invalid plugin identifier. +extern const PluginId kNullPlugin; + +// Enumeration to list the supported types of plugins / support libraries. +enum class PluginKind { + kInvalid, + kBlas, + kDnn, + kFft, + kRng, +}; + +// A PluginConfig describes the set of plugins to be used by a StreamExecutor +// instance. Each plugin is defined by an arbitrary identifier, usually best set +// to the address of a static member in the implementation (to avoid conflicts). +// +// A PluginConfig may be passed to the StreamExecutor constructor - the plugins +// described therein will be used to provide BLAS, DNN, FFT, and RNG +// functionality. Platform-appropriate defaults will be used for any unset +// libraries. If a platform does not support a specified plugin (ex. cuBLAS on +// an OpenCL executor), then an error will be logged and no plugin operations +// will succeed. +// +// The StreamExecutor BUILD target does not link ANY plugin libraries - even +// common host fallbacks! Any plugins must be explicitly linked by dependent +// targets. See the cuda, opencl and host BUILD files for implemented plugin +// support (search for "plugin"). +class PluginConfig { + public: + // Value specifying the platform's default option for that plugin. + static const PluginId kDefault; + + // Initializes all members to the default options. + PluginConfig(); + + bool operator==(const PluginConfig& rhs) const; + + // Sets the appropriate library kind to that passed in.
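+ // The setters return *this, so calls can be chained; e.g. (a sketch with + // hypothetical plugin IDs): + // + // PluginConfig config = PluginConfig().SetBlas(kMyBlas).SetRng(kMyRng);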
+ PluginConfig& SetBlas(PluginId blas); + PluginConfig& SetDnn(PluginId dnn); + PluginConfig& SetFft(PluginId fft); + PluginConfig& SetRng(PluginId rng); + + PluginId blas() const { return blas_; } + PluginId dnn() const { return dnn_; } + PluginId fft() const { return fft_; } + PluginId rng() const { return rng_; } + + private: + PluginId blas_, dnn_, fft_, rng_; +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_ diff --git a/tensorflow/stream_executor/plugin_registry.cc b/tensorflow/stream_executor/plugin_registry.cc new file mode 100644 index 0000000000..eda44d1146 --- /dev/null +++ b/tensorflow/stream_executor/plugin_registry.cc @@ -0,0 +1,228 @@ +#include "tensorflow/stream_executor/plugin_registry.h" + +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" + +namespace perftools { +namespace gputools { + +const PluginId kNullPlugin = nullptr; + +// Returns the string representation of the specified PluginKind. +string PluginKindString(PluginKind plugin_kind) { + switch (plugin_kind) { + case PluginKind::kBlas: + return "BLAS"; + case PluginKind::kDnn: + return "DNN"; + case PluginKind::kFft: + return "FFT"; + case PluginKind::kRng: + return "RNG"; + case PluginKind::kInvalid: + default: + return "kInvalid"; + } +} + +PluginRegistry::DefaultFactories::DefaultFactories() : + blas(kNullPlugin), dnn(kNullPlugin), fft(kNullPlugin), rng(kNullPlugin) { } + +/* static */ mutex PluginRegistry::mu_(LINKER_INITIALIZED); +/* static */ PluginRegistry* PluginRegistry::instance_ = nullptr; + +PluginRegistry::PluginRegistry() {} + +/* static */ PluginRegistry* PluginRegistry::Instance() { + mutex_lock lock{mu_}; + if (instance_ == nullptr) { + instance_ = new PluginRegistry(); + } + return instance_; +} + +void PluginRegistry::MapPlatformKindToId(PlatformKind platform_kind, + Platform::Id platform_id) { + platform_id_by_kind_[platform_kind] = platform_id; +} + +template <typename FACTORY_TYPE> +port::Status PluginRegistry::RegisterFactoryInternal( + PluginId plugin_id, const string& plugin_name, FACTORY_TYPE factory, + std::map<PluginId, FACTORY_TYPE>* factories) { + mutex_lock lock{mu_}; + + if (factories->find(plugin_id) != factories->end()) { + return port::Status{ + port::error::ALREADY_EXISTS, + port::Printf("Attempting to register factory for plugin %s when " + "one has already been registered", + plugin_name.c_str())}; + } + + (*factories)[plugin_id] = factory; + plugin_names_[plugin_id] = plugin_name; + return port::Status::OK(); +} + +template <typename FACTORY_TYPE> +port::StatusOr<FACTORY_TYPE> PluginRegistry::GetFactoryInternal( + PluginId plugin_id, const std::map<PluginId, FACTORY_TYPE>& factories, + const std::map<PluginId, FACTORY_TYPE>& generic_factories) const { + auto iter = factories.find(plugin_id); + if (iter == factories.end()) { + iter = generic_factories.find(plugin_id); + if (iter == generic_factories.end()) { + return port::Status{ + port::error::NOT_FOUND, + port::Printf("Plugin ID %p not registered.", plugin_id)}; + } + } + + return iter->second; +} + +bool PluginRegistry::SetDefaultFactory(Platform::Id platform_id, + PluginKind plugin_kind, + PluginId plugin_id) { + if (!HasFactory(platform_id, plugin_kind, plugin_id)) { + port::StatusOr<Platform*> status = + MultiPlatformManager::PlatformWithId(platform_id); + string platform_name = "<unregistered platform>"; + if (status.ok()) { + platform_name = 
status.ValueOrDie()->Name(); + } + + LOG(ERROR) << "A factory must be registered for a platform before being " + << "set as default! " + << "Platform name: " << platform_name + << ", PluginKind: " << PluginKindString(plugin_kind) + << ", PluginId: " << plugin_id; + return false; + } + + switch (plugin_kind) { + case PluginKind::kBlas: + default_factories_[platform_id].blas = plugin_id; + break; + case PluginKind::kDnn: + default_factories_[platform_id].dnn = plugin_id; + break; + case PluginKind::kFft: + default_factories_[platform_id].fft = plugin_id; + break; + case PluginKind::kRng: + default_factories_[platform_id].rng = plugin_id; + break; + default: + LOG(ERROR) << "Invalid plugin kind specified: " + << static_cast<int>(plugin_kind); + return false; + } + + return true; +} + +bool PluginRegistry::HasFactory(const PluginFactories& factories, + PluginKind plugin_kind, + PluginId plugin_id) const { + switch (plugin_kind) { + case PluginKind::kBlas: + return factories.blas.find(plugin_id) != factories.blas.end(); + case PluginKind::kDnn: + return factories.dnn.find(plugin_id) != factories.dnn.end(); + case PluginKind::kFft: + return factories.fft.find(plugin_id) != factories.fft.end(); + case PluginKind::kRng: + return factories.rng.find(plugin_id) != factories.rng.end(); + default: + LOG(ERROR) << "Invalid plugin kind specified: " + << PluginKindString(plugin_kind); + return false; + } +} + +bool PluginRegistry::HasFactory(Platform::Id platform_id, + PluginKind plugin_kind, + PluginId plugin_id) const { + auto iter = factories_.find(platform_id); + if (iter != factories_.end()) { + if (HasFactory(iter->second, plugin_kind, plugin_id)) { + return true; + } + } + + return HasFactory(generic_factories_, plugin_kind, plugin_id); +} + +// Explicit instantiations to support types exposed in user/public API. 
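+// For each (factory type, member name, display string) triple, the macro below +// emits the explicit template instantiations plus the RegisterFactory, +// RegisterFactoryForAllPlatforms and GetFactory specializations that route +// through the corresponding per-kind factory maps.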
+#define EMIT_PLUGIN_SPECIALIZATIONS(FACTORY_TYPE, FACTORY_VAR, PLUGIN_STRING) \ + template port::StatusOr<PluginRegistry::FACTORY_TYPE> \ + PluginRegistry::GetFactoryInternal<PluginRegistry::FACTORY_TYPE>( \ + PluginId plugin_id, \ + const std::map<PluginId, PluginRegistry::FACTORY_TYPE>& factories, \ + const std::map<PluginId, PluginRegistry::FACTORY_TYPE>& \ + generic_factories) const; \ + \ + template port::Status \ + PluginRegistry::RegisterFactoryInternal<PluginRegistry::FACTORY_TYPE>( \ + PluginId plugin_id, const string& plugin_name, \ + PluginRegistry::FACTORY_TYPE factory, \ + std::map<PluginId, PluginRegistry::FACTORY_TYPE>* factories); \ + \ + template <> \ + port::Status PluginRegistry::RegisterFactory<PluginRegistry::FACTORY_TYPE>( \ + Platform::Id platform_id, PluginId plugin_id, const string& name, \ + PluginRegistry::FACTORY_TYPE factory) { \ + return RegisterFactoryInternal(plugin_id, name, factory, \ + &factories_[platform_id].FACTORY_VAR); \ + } \ + \ + template <> \ + port::Status PluginRegistry::RegisterFactoryForAllPlatforms< \ + PluginRegistry::FACTORY_TYPE>(PluginId plugin_id, const string& name, \ + PluginRegistry::FACTORY_TYPE factory) { \ + return RegisterFactoryInternal(plugin_id, name, factory, \ + &generic_factories_.FACTORY_VAR); \ + } \ + \ + template <> \ + port::StatusOr<PluginRegistry::FACTORY_TYPE> PluginRegistry::GetFactory( \ + Platform::Id platform_id, PluginId plugin_id) { \ + if (plugin_id == PluginConfig::kDefault) { \ + plugin_id = default_factories_[platform_id].FACTORY_VAR; \ + \ + if (plugin_id == kNullPlugin) { \ + return port::Status{port::error::FAILED_PRECONDITION, \ + "No suitable " PLUGIN_STRING \ + " plugin registered, default or otherwise."}; \ + } else { \ + VLOG(2) << "Selecting default " PLUGIN_STRING " plugin, " \ + << plugin_names_[plugin_id]; \ + } \ + } \ + return GetFactoryInternal(plugin_id, factories_[platform_id].FACTORY_VAR, \ + generic_factories_.FACTORY_VAR); \ + } \ + \ + /* TODO(b/22689637): Also temporary WRT MultiPlatformManager */ \ + template <> \ + port::StatusOr<PluginRegistry::FACTORY_TYPE> PluginRegistry::GetFactory( \ + PlatformKind platform_kind, PluginId plugin_id) { \ + auto iter = platform_id_by_kind_.find(platform_kind); \ + if (iter == platform_id_by_kind_.end()) { \ + return port::Status{port::error::FAILED_PRECONDITION, \ + port::Printf("Platform kind %d not registered.", \ + static_cast<int>(platform_kind))}; \ + } \ + return GetFactory<PluginRegistry::FACTORY_TYPE>(iter->second, plugin_id); \ + } + +EMIT_PLUGIN_SPECIALIZATIONS(BlasFactory, blas, "BLAS"); +EMIT_PLUGIN_SPECIALIZATIONS(DnnFactory, dnn, "DNN"); +EMIT_PLUGIN_SPECIALIZATIONS(FftFactory, fft, "FFT"); +EMIT_PLUGIN_SPECIALIZATIONS(RngFactory, rng, "RNG"); + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/plugin_registry.h b/tensorflow/stream_executor/plugin_registry.h new file mode 100644 index 0000000000..f1ea59853d --- /dev/null +++ b/tensorflow/stream_executor/plugin_registry.h @@ -0,0 +1,155 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_PLUGIN_REGISTRY_H_ +#define TENSORFLOW_STREAM_EXECUTOR_PLUGIN_REGISTRY_H_ + +#include <map> + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/stream_executor/fft.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include 
"tensorflow/stream_executor/plugin.h" +#include "tensorflow/stream_executor/rng.h" + +namespace perftools { +namespace gputools { + +namespace internal { +class StreamExecutorInterface; +} + +// The PluginRegistry is a singleton that maintains the set of registered +// "support library" plugins. Currently, there are four kinds of plugins: +// BLAS, DNN, FFT, and RNG. Each interface is defined in the corresponding +// gpu_{kind}.h header. +// +// At runtime, a StreamExecutor object will query the singleton registry to +// retrieve the plugin kind that StreamExecutor was configured with (refer to +// the StreamExecutor and PluginConfig declarations). +// +// Plugin libraries are best registered using REGISTER_MODULE_INITIALIZER, +// but can be registered at any time. When registering a DSO-backed plugin, it +// is usually a good idea to load the DSO at registration time, to prevent +// late-loading from distorting performance/benchmarks as much as possible. +class PluginRegistry { + public: + typedef blas::BlasSupport* (*BlasFactory)(internal::StreamExecutorInterface*); + typedef dnn::DnnSupport* (*DnnFactory)(internal::StreamExecutorInterface*); + typedef fft::FftSupport* (*FftFactory)(internal::StreamExecutorInterface*); + typedef rng::RngSupport* (*RngFactory)(internal::StreamExecutorInterface*); + + // Gets (and creates, if necessary) the singleton PluginRegistry instance. + static PluginRegistry* Instance(); + + // Registers the specified factory with the specified platform. + // Returns a non-successful status if the factory has already been registered + // with that platform (but execution should be otherwise unaffected). + template <typename FactoryT> + port::Status RegisterFactory(Platform::Id platform_id, PluginId plugin_id, + const string& name, FactoryT factory); + + // Registers the specified factory as usable by _all_ platform types. + // Reports errors just as RegisterFactory. + template <typename FactoryT> + port::Status RegisterFactoryForAllPlatforms(PluginId plugin_id, + const string& name, + FactoryT factory); + + // TODO(b/22689637): Setter for temporary mapping until all users are using + // MultiPlatformManager / PlatformId. + void MapPlatformKindToId(PlatformKind platform_kind, + Platform::Id platform_id); + + // Potentially sets the plugin identified by plugin_id to be the default + // for the specified platform and plugin kind. If this routine is called + // multiple types for the same PluginKind, the PluginId given in the last call + // will be used. + bool SetDefaultFactory(Platform::Id platform_id, PluginKind plugin_kind, + PluginId plugin_id); + + // Return true if the factory/id has been registered for the + // specified platform and plugin kind and false otherwise. + bool HasFactory(Platform::Id platform_id, PluginKind plugin_kind, + PluginId plugin) const; + + // Retrieves the factory registered for the specified kind, + // or a port::Status on error. + template <typename FactoryT> + port::StatusOr<FactoryT> GetFactory(Platform::Id platform_id, + PluginId plugin_id); + + // TODO(b/22689637): Deprecated/temporary. Will be deleted once all users are + // on MultiPlatformManager / PlatformId. + template <typename FactoryT> + port::StatusOr<FactoryT> GetFactory(PlatformKind platform_kind, + PluginId plugin_id); + + private: + // Containers for the sets of registered factories, by plugin kind. 
+ struct PluginFactories { + std::map<PluginId, BlasFactory> blas; + std::map<PluginId, DnnFactory> dnn; + std::map<PluginId, FftFactory> fft; + std::map<PluginId, RngFactory> rng; + }; + + // Simple structure to hold the currently configured default plugins (for a + // particular Platform). + struct DefaultFactories { + DefaultFactories(); + PluginId blas, dnn, fft, rng; + }; + + PluginRegistry(); + + // Actually performs the work of registration. + template <typename FactoryT> + port::Status RegisterFactoryInternal(PluginId plugin_id, + const string& plugin_name, + FactoryT factory, + std::map<PluginId, FactoryT>* factories); + + // Actually performs the work of factory retrieval. + template <typename FactoryT> + port::StatusOr<FactoryT> GetFactoryInternal( + PluginId plugin_id, const std::map<PluginId, FactoryT>& factories, + const std::map<PluginId, FactoryT>& generic_factories) const; + + // Returns true if the specified plugin has been registered with the specified + // platform factories. Unlike the other overload of this method, this does + // not implicitly examine the default factory lists. + bool HasFactory(const PluginFactories& factories, PluginKind plugin_kind, + PluginId plugin) const; + + // As this object is a singleton, a global mutex can be used for static and + // instance protection. + static mutex mu_; + + // The singleton itself. + static PluginRegistry* instance_; + + // TODO(b/22689637): Temporary mapping until all users are using + // MultiPlatformManager / PlatformId. + std::map<PlatformKind, Platform::Id> platform_id_by_kind_; + + // The set of registered factories, keyed by platform ID. + std::map<Platform::Id, PluginFactories> factories_; + + // Plugins supported for all platform kinds. + PluginFactories generic_factories_; + + // The sets of default factories, keyed by platform ID. + std::map<Platform::Id, DefaultFactories> default_factories_; + + // Lookup table for plugin names. + std::map<PluginId, string> plugin_names_; + + SE_DISALLOW_COPY_AND_ASSIGN(PluginRegistry); +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_REGISTRY_H_ diff --git a/tensorflow/stream_executor/rng.cc b/tensorflow/stream_executor/rng.cc new file mode 100644 index 0000000000..052b502194 --- /dev/null +++ b/tensorflow/stream_executor/rng.cc @@ -0,0 +1,36 @@ +#include "tensorflow/stream_executor/rng.h" + +#include "tensorflow/stream_executor/platform/logging.h" + +namespace perftools { +namespace gputools { +namespace rng { + +bool RngSupport::CheckSeed(const uint8 *seed, uint64 seed_bytes) { + CHECK(seed != nullptr); + + if (seed_bytes < kMinSeedBytes) { + LOG(INFO) << "Insufficient RNG seed data specified: " << seed_bytes + << ". At least " << RngSupport::kMinSeedBytes + << " bytes are required."; + return false; + } + + if (seed_bytes > kMaxSeedBytes) { + LOG(INFO) << "Too much RNG seed data specified: " << seed_bytes + << ". 
At most " << RngSupport::kMaxSeedBytes + << " bytes may be provided."; + return false; + } + + return true; +} + +#if defined(__APPLE__) +const int RngSupport::kMinSeedBytes; +const int RngSupport::kMaxSeedBytes; +#endif + +} // namespace rng +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/rng.h b/tensorflow/stream_executor/rng.h new file mode 100644 index 0000000000..797631d01d --- /dev/null +++ b/tensorflow/stream_executor/rng.h @@ -0,0 +1,80 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_RNG_H_ +#define TENSORFLOW_STREAM_EXECUTOR_RNG_H_ + +#include <limits.h> +#include <complex> + +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +class Stream; +template <typename ElemT> +class DeviceMemory; + +namespace rng { + +// Random-number-generation support interface -- this can be derived from a GPU +// executor when the underlying platform has an RNG library implementation +// available. See StreamExecutor::AsRng(). +// When a seed is not specified, the backing RNG will be initialized with the +// default seed for that implementation. +// +// Thread-hostile: see StreamExecutor class comment for details on +// thread-hostility. +class RngSupport { + public: + static const int kMinSeedBytes = 16; + static const int kMaxSeedBytes = INT_MAX; + + // Releases any random-number-generation resources associated with this + // support object in the underlying platform implementation. + virtual ~RngSupport() {} + + // Populates a GPU memory allocation with random values appropriate for the + // DeviceMemory element type; i.e. populates DeviceMemory<float> with random + // float values. + virtual bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<float> *v) = 0; + virtual bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<double> *v) = 0; + virtual bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<float>> *v) = 0; + virtual bool DoPopulateRandUniform(Stream *stream, + DeviceMemory<std::complex<double>> *v) = 0; + + // Populates a GPU memory allocation with random values sampled from a + // Gaussian distribution with the given mean and standard deviation. + virtual bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev, + DeviceMemory<float> *v) { + LOG(ERROR) + << "platform's random number generator does not support gaussian"; + return false; + } + virtual bool DoPopulateRandGaussian(Stream *stream, double mean, + double stddev, DeviceMemory<double> *v) { + LOG(ERROR) + << "platform's random number generator does not support gaussian"; + return false; + } + + // Specifies the seed used to initialize the RNG. + // This call does not transfer ownership of the buffer seed; its data should + // not be altered for the lifetime of this call. At least 16 bytes of seed + // data must be provided, but not all seed data will necessarily be used. + // seed: Pointer to seed data. Must not be null. + // seed_bytes: Size of seed buffer in bytes. Must be >= 16. 
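+ // For example (a sketch only; rng is an RngSupport pointer obtained from + // the executor): + // + // uint8 seed[16] = {0}; // in real use, fill with actual entropy + // rng->SetSeed(stream, seed, sizeof(seed));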
+ virtual bool SetSeed(Stream *stream, const uint8 *seed, + uint64 seed_bytes) = 0; + + protected: + static bool CheckSeed(const uint8 *seed, uint64 seed_bytes); +}; + +} // namespace rng +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_RNG_H_ diff --git a/tensorflow/stream_executor/shared_memory_config.h b/tensorflow/stream_executor/shared_memory_config.h new file mode 100644 index 0000000000..f2bfe27117 --- /dev/null +++ b/tensorflow/stream_executor/shared_memory_config.h @@ -0,0 +1,21 @@ +// This file defines a uniform interface to configuration options for shared +// memory for supported devices. As with many StreamExecutor-supported features, +// support for the options defined herein is device-dependent. +#ifndef TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_ +#define TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_ + +namespace perftools { +namespace gputools { + +// SharedMemoryConfig enum describes potential widths of shared memory banks for +// a device or kernel. +enum class SharedMemoryConfig { + kDefault, // Use the device default configuration. + kFourByte, // Sets shared memory banks to be four bytes wide. + kEightByte, // Sets shared memory banks to be eight bytes wide. +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_ diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc new file mode 100644 index 0000000000..ca3ef9aa1a --- /dev/null +++ b/tensorflow/stream_executor/stream.cc @@ -0,0 +1,3329 @@ +#include "tensorflow/stream_executor/stream.h" + +#include "tensorflow/stream_executor/platform/port.h" + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/rng.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" + +namespace perftools { +namespace gputools { + +namespace { +static internal::StreamInterface *CreateStreamImplementation( + StreamExecutor *parent) { + PlatformKind platform_kind = parent->platform_kind(); + if (platform_kind == PlatformKind::kCuda) { + return (*internal::MakeCUDAStreamImplementation())(parent); + } else if (platform_kind == PlatformKind::kOpenCL || + platform_kind == PlatformKind::kOpenCLAltera) { + return (*internal::MakeOpenCLStreamImplementation())(parent); + } else if (platform_kind == PlatformKind::kHost) { + return internal::MakeHostStreamImplementation(parent); + } else { + LOG(FATAL) << "cannot create stream implementation for platform kind: " + << PlatformKindString(platform_kind); + } +} + +// Code to turn parameters to functions on stream into strings that +// will be VLOG'ed. We need overloads, instead of +// e.g. BatchDescriptorToVlogString(), as the code that calls these +// functions does not know what the type of the parameter is. 
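+// For example, a call site can simply write ToVlogString(t) for a +// blas::Transpose t, and overload resolution selects the TransposeString-based +// overload below.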
+string ToVlogString(const dnn::BatchDescriptor &descriptor) { + return descriptor.ToShortString(); +} + +string ToVlogString(const dnn::FilterDescriptor &descriptor) { + return descriptor.ToShortString(); +} + +string ToVlogString(const dnn::ConvolutionDescriptor &descriptor) { + return descriptor.ToShortString(); +} + +string ToVlogString(const dnn::PoolingDescriptor &descriptor) { + return descriptor.ToShortString(); +} + +string ToVlogString(const dnn::NormalizeDescriptor &descriptor) { + return descriptor.ToShortString(); +} + +string ToVlogString(dnn::ActivationMode mode) { + return dnn::ActivationModeString(mode); +} + +string ToVlogString(dnn::ElementwiseOperation op) { + return dnn::ElementwiseOperationString(op); +} + +string ToVlogString(blas::Transpose t) { return blas::TransposeString(t); } + +string ToVlogString(blas::UpperLower ul) { return blas::UpperLowerString(ul); } + +string ToVlogString(blas::Diagonal d) { return blas::DiagonalString(d); } + +string ToVlogString(blas::Side s) { return blas::SideString(s); } + +string ToVlogString(const void *ptr) { + if (ptr == nullptr) { + return "null"; + } + + // StrCat does not convert pointers to text. + std::ostringstream out; + out << ptr; + return out.str(); +} + +template <class T> +string ToVlogString(const std::complex<T> &c) { + // StrCat does not convert std::complex to text. + std::ostringstream out; + out << c; + return out.str(); +} + +template <class T> +string ToVlogString(const std::function<T> &f) { + return f == nullptr ? "null" : "<non-null function>"; +} + +string ToVlogString(const DeviceMemoryBase &memory) { + return ToVlogString(memory.opaque()); +} + +string ToVlogString(const DeviceMemoryBase *memory) { + return ToVlogString(*memory); +} + +string ToVlogString(int i) { return port::StrCat(i); } + +string ToVlogString(uint32 i) { return port::StrCat(i); } + +string ToVlogString(uint64 i) { return port::StrCat(i); } + +string ToVlogString(float f) { return port::StrCat(f); } + +string ToVlogString(double d) { return port::StrCat(d); } + +template <class T> +string ToVlogString(port::ArraySlice<T> elements) { + string str = port::StrCat( + ToVlogString(reinterpret_cast<const void *>(elements.data())), "[", + elements.size(), "]{"); + const char *separator = ""; + size_t max_to_show = std::numeric_limits<size_t>::max(); + if (!VLOG_IS_ON(2)) { + max_to_show = 5; + } else if (!VLOG_IS_ON(3)) { + max_to_show = 20; + } else if (!VLOG_IS_ON(11)) { + max_to_show = 1000; + } + for (size_t i = 0; i < elements.size(); ++i) { + if (i == max_to_show) { + str += ", ..."; + break; + } + port::StrAppend(&str, separator, ToVlogString(elements[i])); + separator = ", "; + } + str += "}"; + return str; +} + +template <class T> +string ToVlogString(port::MutableArraySlice<T> elements) { + return ToVlogString(port::ArraySlice<T>(elements)); +} + +// Used together with PARAM to VLOG calls made to the stream. Intended +// to be used like this: +// +// VLOG(1) << CallStr("MyFunction", this, {PARAM(a), PARAM(b)}); +// +// where a and b are the parameters to MyFunction. +// +// See VLOG_CALL for a short-hand for this. This way of doing it saves +// a tremendous amount of boilerplate code given how many functions +// there are on Stream and how many parameters they each have. +string CallStr(const char *function_name, Stream *stream, + std::vector<std::pair<const char *, string>> params) { + // Do not call this function unless VLOG is on since just + // constructing all the strings in params is expensive. 
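+ // (CallStr is intended to be reached only via VLOG_CALL, whose VLOG(1) + // guard skips evaluating the arguments when logging is disabled; the CHECK + // below enforces that precondition.)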
+  CHECK(VLOG_IS_ON(1));
+
+  string str = port::StrCat("Called Stream::", function_name, "(");
+  const char *separator = "";
+  for (const auto &param : params) {
+    port::StrAppend(&str, separator, param.first, "=", param.second);
+    separator = ", ";
+  }
+  port::StrAppend(&str, ") stream=", ToVlogString(stream));
+  return str;
+}
+
+// Use this macro to avoid having to type every parameter twice to log
+// it with VLOG and CallStr.
+#define PARAM(parameter) \
+  { #parameter, ToVlogString(parameter) }
+
+// Use this macro to avoid having to type out the name of each
+// function and to save some boilerplate. Intended to be used like this:
+//
+//   VLOG_CALL(PARAM(a), PARAM(b))
+//
+// This saves a tremendous amount of boilerplate compared to the alternative:
+//
+//   VLOG(1) << "Calling MyFunction(a=" << ToVlogString(a)
+//           << ", b=" << ToVlogString(b);
+//
+// Note here that most of the parameter names are not short and that
+// most of the functions take many more than 2 parameters.
+#define VLOG_CALL(...) VLOG(1) << CallStr(__func__, this, {__VA_ARGS__})
+
+}  // namespace
+
+Stream::Stream(StreamExecutor *parent)
+    : implementation_(CreateStreamImplementation(parent)),
+      parent_(parent),
+      allocated_(false),
+      ok_(false),
+      temporary_memory_manager_(this) {
+  VLOG_CALL(PARAM(parent));
+}
+
+Stream::Stream(StreamExecutor *parent,
+               internal::StreamInterface *implementation)
+    : implementation_(implementation),
+      parent_(parent),
+      allocated_(false),
+      ok_(false),
+      temporary_memory_manager_(this) {
+  VLOG_CALL(PARAM(parent), PARAM(implementation));
+}
+
+Stream::~Stream() {
+  VLOG_CALL();
+
+  temporary_memory_manager_.ForceDeallocateAll();
+
+  if (allocated_) {
+    parent_->DeallocateStream(this);
+  }
+}
+
+Stream &Stream::Init() {
+  VLOG_CALL();
+
+  mutex_lock lock{mu_};
+  CHECK_EQ(false, allocated_)
+      << "stream appears to already have been initialized";
+  CHECK(!ok_) << "stream should be in !ok() state pre-initialization";
+
+  if (parent_->AllocateStream(this)) {
+    // Successful initialization!
+    allocated_ = true;
+    ok_ = true;
+  } else {
+    LOG(ERROR) << "failed to allocate stream during initialization";
+  }
+
+  return *this;
+}
+
+Stream &Stream::InitTimer(Timer *timer) {
+  VLOG_CALL(PARAM(timer));
+
+  if (ok()) {
+    CheckError(parent_->AllocateTimer(timer));
+  } else {
+    LOG(INFO) << "did not allocate timer: " << timer;
+  }
+  return *this;
+}
+
+Stream &Stream::InitWithTimer(Timer *timer) {
+  VLOG_CALL(PARAM(timer));
+
+  return Init().InitTimer(timer);
+}
+
+Stream &Stream::ThenRecordEvent(Event *event) {
+  VLOG_CALL(PARAM(event));
+
+  port::Status status = parent_->RecordEvent(this, event);
+  if (!status.ok()) {
+    LOG(ERROR) << "Error recording event in stream: " << status.error_message()
+               << "; not marking stream as bad, as the Event object may be "
+               << "at fault. 
Monitor for further errors."; + } + + return *this; +} + +Stream &Stream::ThenConvolve( + const dnn::BatchDescriptor &batch_descriptor, + const DeviceMemory<float> &input_data, + const dnn::FilterDescriptor &filter_descriptor, + const DeviceMemory<float> &filter_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> *output) { + VLOG_CALL(PARAM(batch_descriptor), PARAM(input_data), + PARAM(filter_descriptor), PARAM(filter_data), + PARAM(convolution_descriptor), PARAM(output_descriptor), + PARAM(output)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoConvolve( + this, batch_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenSeparableConvolve( + const dnn::BatchDescriptor &batch_descriptor, + const DeviceMemory<float> &input_data, + const dnn::FilterDescriptor &filter_descriptor, int depth_multiplier, + const DeviceMemory<float> &first_weights, + const DeviceMemory<float> &second_weights, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> *output) { + VLOG_CALL( + PARAM(batch_descriptor), PARAM(input_data), PARAM(filter_descriptor), + PARAM(depth_multiplier), PARAM(first_weights), PARAM(second_weights), + PARAM(convolution_descriptor), PARAM(output_descriptor), PARAM(output)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoSeparableConvolve( + this, batch_descriptor, input_data, filter_descriptor, + depth_multiplier, first_weights, second_weights, + convolution_descriptor, output_descriptor, output)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenConvolveBackwardData( + const dnn::FilterDescriptor &filter_descriptor, + const DeviceMemory<float> &filter_data, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &input_descriptor, + DeviceMemory<float> *backward_input_data) { + VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data), + PARAM(output_descriptor), PARAM(backward_output_data), + PARAM(convolution_descriptor), PARAM(input_descriptor), + PARAM(backward_input_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoConvolveBackwardData( + this, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenConvolveBackwardFilter( + const dnn::BatchDescriptor &input_descriptor, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::FilterDescriptor &filter_descriptor, + DeviceMemory<float> *backward_filter_data) { + VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), + PARAM(output_descriptor), 
PARAM(backward_output_data), + PARAM(convolution_descriptor), PARAM(filter_descriptor), + PARAM(backward_filter_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoConvolveBackwardFilter( + this, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenMatMul(const DeviceMemory<float> &input_data, + const DeviceMemory<float> &weights, + const dnn::BatchDescriptor &input_dimensions, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_data), PARAM(weights), PARAM(input_dimensions), + PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoMatMul(this, input_data, weights, input_dimensions, + output_dimensions, output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenMatMulQuantized( + const DeviceMemory<float> &input_data, const DeviceMemory<int8> &weights, + const DeviceMemory<float> &weight_scales, + const dnn::BatchDescriptor &input_dimensions, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_data), PARAM(weights), PARAM(weight_scales), + PARAM(input_dimensions), PARAM(output_dimensions), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoMatMulQuantized(this, input_data, weights, + weight_scales, input_dimensions, + output_dimensions, output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenMatMulQuantized( + const DeviceMemory<float> &input_data, const DeviceMemory<int16> &weights, + const DeviceMemory<float> &weight_scales, + const dnn::BatchDescriptor &input_dimensions, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_data), PARAM(weights), PARAM(weight_scales), + PARAM(input_dimensions), PARAM(output_dimensions), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoMatMulQuantized(this, input_data, weights, + weight_scales, input_dimensions, + output_dimensions, output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenBiasAdd(const DeviceMemory<float> &input_data, + const DeviceMemory<float> &biases, + const dnn::BatchDescriptor &dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_data), PARAM(biases), PARAM(dimensions), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoBiasAdd(this, input_data, biases, dimensions, output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenPoolForward( + const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const 
DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions), + PARAM(input_data), PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions, + input_data, output_dimensions, + output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenPoolBackward( + const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + const DeviceMemory<float> &output_data, + const DeviceMemory<float> &input_diff_data, + DeviceMemory<float> *output_diff_data) { + VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions), + PARAM(input_data), PARAM(output_dimensions), PARAM(output_data), + PARAM(input_diff_data), PARAM(output_diff_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoPoolBackward(this, pooling_dimensions, input_dimensions, + input_data, output_dimensions, output_data, + input_diff_data, output_diff_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenNormalize( + const dnn::NormalizeDescriptor &normalize_descriptor, + const DeviceMemory<float> &input_data, DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(normalize_descriptor), PARAM(input_data), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoNormalize(this, normalize_descriptor, input_data, + output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenActivate(dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor &dimensions, + const DeviceMemory<float> &input_data, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(activation_mode), PARAM(dimensions), PARAM(input_data), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoActivate(this, activation_mode, dimensions, input_data, + output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenDepthConcatenate( + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoDepthConcatenate(this, input_dimensions, input_data, + output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenElementwiseOperate( + dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> 
*output_data) { + VLOG_CALL(PARAM(operation), PARAM(input_dimensions), PARAM(input_data), + PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoElementwiseOperate(this, operation, input_dimensions, + input_data, output_dimensions, + output_data)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenMemcpyD2HQuantized( + const DeviceMemory<float> &gpu_unquantized_src, + port::MutableArraySlice<uint8> host_dst) { + VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(host_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoMemcpyD2HQuantized(this, gpu_unquantized_src, host_dst)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenMemcpyD2HQuantized( + const DeviceMemory<float> &gpu_unquantized_src, + port::MutableArraySlice<uint16> host_dst) { + VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(host_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoMemcpyD2HQuantized(this, gpu_unquantized_src, host_dst)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenMemcpyD2HQuantized( + const DeviceMemory<float> &gpu_unquantized_src, + port::MutableArraySlice<int32> host_dst) { + VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(host_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoMemcpyD2HQuantized(this, gpu_unquantized_src, host_dst)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenMemcpyH2DQuantized( + port::ArraySlice<uint8> host_src, + DeviceMemory<float> *gpu_unquantized_dst) { + VLOG_CALL(PARAM(host_src), PARAM(gpu_unquantized_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoMemcpyH2DQuantized(this, host_src, gpu_unquantized_dst)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream *Stream::GetOrCreateSubStream() { + mutex_lock lock{mu_}; + for (auto &stream : sub_streams_) { + if (stream.second) { + stream.second = false; + return stream.first.get(); + } + } + sub_streams_.emplace_back(std::unique_ptr<Stream>{new Stream{parent_}}, + false); + Stream *sub_stream = sub_streams_.back().first.get(); + sub_stream->Init(); + CHECK(ok_) << "sub-stream failed to be initialized"; + + return sub_stream; +} + +void Stream::ReturnSubStream(Stream *sub_stream) { + mutex_lock lock{mu_}; + for (auto &stream : sub_streams_) { + if (stream.first.get() == sub_stream) { + stream.second = true; + return; + } + } + LOG(FATAL) << "the sub-stream to be returned is not created by this stream"; +} + +Stream &Stream::ThenStartTimer(Timer *t) { + VLOG_CALL(PARAM(t)); + + if (ok()) { + CheckError(parent_->StartTimer(this, t)); + } else { + LOG(INFO) << "stream " << this << " did not enqueue 'start timer': " << t; + } + return *this; +} + +Stream &Stream::ThenStopTimer(Timer *t) { + VLOG_CALL(PARAM(t)); + + if (ok()) { + CheckError(parent_->StopTimer(this, t)); + 
} else { + LOG(INFO) << "stream " << this << " did not enqueue 'stop timer': " << t; + } + return *this; +} + +Stream &Stream::ThenWaitFor(Stream *other) { + VLOG_CALL(PARAM(other)); + + CHECK(this != other) << "stream cannot wait for itself"; + if (ok() && other->ok()) { + CheckError(parent_->CreateStreamDependency(this, other)); + } else { + SetError(); + LOG(INFO) << "stream " << this << " did not wait for stream: " << other; + } + return *this; +} + +Stream &Stream::ThenWaitFor(std::vector<std::unique_ptr<Stream>> *others) { + VLOG_CALL(PARAM(others)); + + for (auto &stream : *others) { + CHECK_NE(stream.get(), this); + ThenWaitFor(stream.get()); + } + return *this; +} + +Stream &Stream::ThenWaitFor(Event *event) { + VLOG_CALL(PARAM(event)); + + if (ok()) { + port::Status status = parent_->WaitForEvent(this, event); + if (!status.ok()) { + LOG(ERROR) << "Error waiting for event in stream: " + << status.error_message() + << "; not marking stream as bad, as the Event object may be " + << "at fault. Monitor for further errors."; + } + } else { + LOG(INFO) << "stream " << this << " did not wait for an event."; + } + return *this; +} + +// A functor that implements ThenBlasXXX interfaces, which calls DoBlasXXX +// functions and logs for errors. +template <typename... Args> +struct ThenBlasImpl { + // blas_func is the DoBlasXXX member function pointer, and args are its + // arguments except the first one of Stream* type. + Stream &operator()(Stream *stream, + bool (blas::BlasSupport::*blas_func)(Stream *, Args...), + Args... args); +}; + +template <typename... Args> +Stream &ThenBlasImpl<Args...>::operator()( + Stream *stream, bool (blas::BlasSupport::*blas_func)(Stream *, Args...), + Args... args) { + if (stream->ok()) { + if (blas::BlasSupport *blas = stream->parent_->AsBlas()) { + stream->CheckError((blas->*blas_func)(stream, args...)); + } else { + stream->CheckError(false); + LOG(WARNING) + << "attempting to perform BLAS operation using StreamExecutor " + "without BLAS support"; + } + } + return *stream; +} + +Stream &Stream::ThenBlasAsum(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<float> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<float> &, int, DeviceMemory<float> *> + impl; + return impl(this, &blas::BlasSupport::DoBlasAsum, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasAsum(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<double> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<double> &, int, + DeviceMemory<double> *> impl; + return impl(this, &blas::BlasSupport::DoBlasAsum, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasAsum(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<float> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<float> *> impl; + return impl(this, &blas::BlasSupport::DoBlasAsum, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasAsum(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<double> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<double> *> impl; + return impl(this, 
&blas::BlasSupport::DoBlasAsum, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasAxpy(uint64 elem_count, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<uint64, float, const DeviceMemory<float> &, int, + DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, + y, incy); +} + +Stream &Stream::ThenBlasAxpy(uint64 elem_count, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<uint64, double, const DeviceMemory<double> &, int, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, + y, incy); +} + +Stream &Stream::ThenBlasAxpy(uint64 elem_count, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<std::complex<float>> *y, + int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, + y, incy); +} + +Stream &Stream::ThenBlasAxpy(uint64 elem_count, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<std::complex<double>> *y, + int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasAxpy, elem_count, alpha, x, incx, + y, incy); +} + +Stream &Stream::ThenBlasCopy(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, const DeviceMemory<float> &, int, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasCopy(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<double> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, const DeviceMemory<double> &, int, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasCopy(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<std::complex<float>> *y, + int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasCopy(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<std::complex<double>> *y, + int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<double>> &, int, + 
DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasCopy, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasDot(uint64 elem_count, const DeviceMemory<float> &x, + int incx, const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<float> &, int, + const DeviceMemory<float> &, int, DeviceMemory<float> *> impl; + return impl(this, &blas::BlasSupport::DoBlasDot, elem_count, x, incx, y, incy, + result); +} + +Stream &Stream::ThenBlasDot(uint64 elem_count, const DeviceMemory<double> &x, + int incx, const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<double> &, int, + const DeviceMemory<double> &, int, DeviceMemory<double> *> impl; + return impl(this, &blas::BlasSupport::DoBlasDot, elem_count, x, incx, y, incy, + result); +} + +Stream &Stream::ThenBlasDotc(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, + int incx, + const DeviceMemory<std::complex<float>> &y, + int incy, + DeviceMemory<std::complex<float>> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasDotc, elem_count, x, incx, y, + incy, result); +} + +Stream &Stream::ThenBlasDotc(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, + const DeviceMemory<std::complex<double>> &y, + int incy, + DeviceMemory<std::complex<double>> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasDotc, elem_count, x, incx, y, + incy, result); +} + +Stream &Stream::ThenBlasDotu(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, + int incx, + const DeviceMemory<std::complex<float>> &y, + int incy, + DeviceMemory<std::complex<float>> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasDotu, elem_count, x, incx, y, + incy, result); +} + +Stream &Stream::ThenBlasDotu(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, + const DeviceMemory<std::complex<double>> &y, + int incy, + DeviceMemory<std::complex<double>> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasDotu, elem_count, x, incx, y, + incy, result); +} + +Stream &Stream::ThenBlasNrm2(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<float> *result) { + 
VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<float> &, int, DeviceMemory<float> *> + impl; + return impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasNrm2(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<double> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<double> &, int, + DeviceMemory<double> *> impl; + return impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasNrm2(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<float> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<float> *> impl; + return impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasNrm2(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<double> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<double> *> impl; + return impl(this, &blas::BlasSupport::DoBlasNrm2, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasRot(uint64 elem_count, DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, float c, + float s) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(c), PARAM(s)); + + ThenBlasImpl<uint64, DeviceMemory<float> *, int, DeviceMemory<float> *, int, + float, float> impl; + return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, + c, s); +} + +Stream &Stream::ThenBlasRot(uint64 elem_count, DeviceMemory<double> *x, + int incx, DeviceMemory<double> *y, int incy, + double c, double s) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(c), PARAM(s)); + + ThenBlasImpl<uint64, DeviceMemory<double> *, int, DeviceMemory<double> *, int, + double, double> impl; + return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, + c, s); +} + +Stream &Stream::ThenBlasRot(uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy, + float c, float s) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(c), PARAM(s)); + + ThenBlasImpl<uint64, DeviceMemory<std::complex<float>> *, int, + DeviceMemory<std::complex<float>> *, int, float, float> impl; + return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, + c, s); +} + +Stream &Stream::ThenBlasRot(uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy, + double c, double s) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy), + PARAM(c), PARAM(s)); + + ThenBlasImpl<uint64, DeviceMemory<std::complex<double>> *, int, + DeviceMemory<std::complex<double>> *, int, double, double> impl; + return impl(this, &blas::BlasSupport::DoBlasRot, elem_count, x, incx, y, incy, + c, s); +} + +Stream &Stream::ThenBlasRotg(DeviceMemory<float> *a, DeviceMemory<float> *b, + DeviceMemory<float> *c, DeviceMemory<float> *s) { + VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s)); + + ThenBlasImpl<DeviceMemory<float> *, DeviceMemory<float> 
*,
+               DeviceMemory<float> *, DeviceMemory<float> *> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s);
+}
+
+Stream &Stream::ThenBlasRotg(DeviceMemory<double> *a, DeviceMemory<double> *b,
+                             DeviceMemory<double> *c, DeviceMemory<double> *s) {
+  VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s));
+
+  ThenBlasImpl<DeviceMemory<double> *, DeviceMemory<double> *,
+               DeviceMemory<double> *, DeviceMemory<double> *> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s);
+}
+
+Stream &Stream::ThenBlasRotg(DeviceMemory<std::complex<float>> *a,
+                             DeviceMemory<std::complex<float>> *b,
+                             DeviceMemory<float> *c,
+                             DeviceMemory<std::complex<float>> *s) {
+  VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s));
+
+  ThenBlasImpl<DeviceMemory<std::complex<float>> *,
+               DeviceMemory<std::complex<float>> *, DeviceMemory<float> *,
+               DeviceMemory<std::complex<float>> *> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s);
+}
+
+Stream &Stream::ThenBlasRotg(DeviceMemory<std::complex<double>> *a,
+                             DeviceMemory<std::complex<double>> *b,
+                             DeviceMemory<double> *c,
+                             DeviceMemory<std::complex<double>> *s) {
+  VLOG_CALL(PARAM(a), PARAM(b), PARAM(c), PARAM(s));
+
+  ThenBlasImpl<DeviceMemory<std::complex<double>> *,
+               DeviceMemory<std::complex<double>> *, DeviceMemory<double> *,
+               DeviceMemory<std::complex<double>> *> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotg, a, b, c, s);
+}
+
+Stream &Stream::ThenBlasRotm(uint64 elem_count, DeviceMemory<float> *x,
+                             int incx, DeviceMemory<float> *y, int incy,
+                             const DeviceMemory<float> &param) {
+  VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy),
+            PARAM(param));
+
+  ThenBlasImpl<uint64, DeviceMemory<float> *, int, DeviceMemory<float> *, int,
+               const DeviceMemory<float> &> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotm, elem_count, x, incx, y,
+              incy, param);
+}
+
+Stream &Stream::ThenBlasRotm(uint64 elem_count, DeviceMemory<double> *x,
+                             int incx, DeviceMemory<double> *y, int incy,
+                             const DeviceMemory<double> &param) {
+  VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy),
+            PARAM(param));
+
+  ThenBlasImpl<uint64, DeviceMemory<double> *, int, DeviceMemory<double> *, int,
+               const DeviceMemory<double> &> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotm, elem_count, x, incx, y,
+              incy, param);
+}
+
+Stream &Stream::ThenBlasRotmg(DeviceMemory<float> *d1, DeviceMemory<float> *d2,
+                              DeviceMemory<float> *x1,
+                              const DeviceMemory<float> &y1,
+                              DeviceMemory<float> *param) {
+  VLOG_CALL(PARAM(d1), PARAM(d2), PARAM(x1), PARAM(y1), PARAM(param));
+
+  ThenBlasImpl<DeviceMemory<float> *, DeviceMemory<float> *,
+               DeviceMemory<float> *, const DeviceMemory<float> &,
+               DeviceMemory<float> *> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotmg, d1, d2, x1, y1, param);
+}
+
+Stream &Stream::ThenBlasRotmg(DeviceMemory<double> *d1,
+                              DeviceMemory<double> *d2,
+                              DeviceMemory<double> *x1,
+                              const DeviceMemory<double> &y1,
+                              DeviceMemory<double> *param) {
+  VLOG_CALL(PARAM(d1), PARAM(d2), PARAM(x1), PARAM(y1), PARAM(param));
+
+  ThenBlasImpl<DeviceMemory<double> *, DeviceMemory<double> *,
+               DeviceMemory<double> *, const DeviceMemory<double> &,
+               DeviceMemory<double> *> impl;
+  return impl(this, &blas::BlasSupport::DoBlasRotmg, d1, d2, x1, y1, param);
+}
+
+Stream &Stream::ThenBlasScal(uint64 elem_count, float alpha,
+                             DeviceMemory<float> *x, int incx) {
+  VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx));
+
+  ThenBlasImpl<uint64, float, DeviceMemory<float> *, int> 
impl; + return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); +} + +Stream &Stream::ThenBlasScal(uint64 elem_count, double alpha, + DeviceMemory<double> *x, int incx) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); + + ThenBlasImpl<uint64, double, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); +} + +Stream &Stream::ThenBlasScal(uint64 elem_count, float alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); + + ThenBlasImpl<uint64, float, DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); +} + +Stream &Stream::ThenBlasScal(uint64 elem_count, double alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); + + ThenBlasImpl<uint64, double, DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); +} + +Stream &Stream::ThenBlasScal(uint64 elem_count, std::complex<float> alpha, + DeviceMemory<std::complex<float>> *x, int incx) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); + + ThenBlasImpl<uint64, std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); +} + +Stream &Stream::ThenBlasScal(uint64 elem_count, std::complex<double> alpha, + DeviceMemory<std::complex<double>> *x, int incx) { + VLOG_CALL(PARAM(elem_count), PARAM(alpha), PARAM(x), PARAM(incx)); + + ThenBlasImpl<uint64, std::complex<double>, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasScal, elem_count, alpha, x, incx); +} + +Stream &Stream::ThenBlasSwap(uint64 elem_count, DeviceMemory<float> *x, + int incx, DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, DeviceMemory<float> *, int, DeviceMemory<float> *, int> + impl; + return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasSwap(uint64 elem_count, DeviceMemory<double> *x, + int incx, DeviceMemory<double> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, DeviceMemory<double> *, int, DeviceMemory<double> *, int> + impl; + return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasSwap(uint64 elem_count, + DeviceMemory<std::complex<float>> *x, int incx, + DeviceMemory<std::complex<float>> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, DeviceMemory<std::complex<float>> *, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasSwap(uint64 elem_count, + DeviceMemory<std::complex<double>> *x, int incx, + DeviceMemory<std::complex<double>> *y, int incy) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(y), PARAM(incy)); + + ThenBlasImpl<uint64, DeviceMemory<std::complex<double>> *, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSwap, elem_count, x, incx, y, + incy); +} + +Stream &Stream::ThenBlasIamax(uint64 
elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<float> &, int, DeviceMemory<int> *> + impl; + return impl(this, &blas::BlasSupport::DoBlasIamax, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasIamax(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<double> &, int, DeviceMemory<int> *> + impl; + return impl(this, &blas::BlasSupport::DoBlasIamax, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasIamax(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<int> *> impl; + return impl(this, &blas::BlasSupport::DoBlasIamax, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasIamax(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<int> *> impl; + return impl(this, &blas::BlasSupport::DoBlasIamax, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasIamin(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<float> &, int, DeviceMemory<int> *> + impl; + return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasIamin(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<double> &, int, DeviceMemory<int> *> + impl; + return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasIamin(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<int> *> impl; + return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasIamin(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<int> *result) { + VLOG_CALL(PARAM(elem_count), PARAM(x), PARAM(incx), PARAM(result)); + + ThenBlasImpl<uint64, const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<int> *> impl; + return impl(this, &blas::BlasSupport::DoBlasIamin, elem_count, x, incx, + result); +} + +Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, + uint64 kl, uint64 ku, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), + PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, uint64, uint64, float, + const DeviceMemory<float> 
&, int, const DeviceMemory<float> &, + int, float, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, + a, lda, x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, + uint64 kl, uint64 ku, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), + PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, uint64, uint64, double, + const DeviceMemory<double> &, int, const DeviceMemory<double> &, + int, double, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, + a, lda, x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, + uint64 kl, uint64 ku, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &x, + int incx, std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), + PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, uint64, uint64, + std::complex<float>, const DeviceMemory<std::complex<float>> &, + int, const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, + a, lda, x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, + uint64 kl, uint64 ku, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &x, + int incx, std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(kl), PARAM(ku), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), PARAM(incx), + PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, uint64, uint64, + std::complex<double>, const DeviceMemory<std::complex<double>> &, + int, const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasGbmv, trans, m, n, kl, ku, alpha, + a, lda, x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, float, + const DeviceMemory<float> &, int, const DeviceMemory<float> &, + int, float, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, + double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + 
VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, double, + const DeviceMemory<double> &, int, const DeviceMemory<double> &, + int, double, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &x, + int incx, std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &x, + int incx, std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), + PARAM(incy)); + + ThenBlasImpl<blas::Transpose, uint64, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemv, trans, m, n, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasGer(uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<uint64, uint64, float, const DeviceMemory<float> &, int, + const DeviceMemory<float> &, int, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasGer, m, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasGer(uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<uint64, uint64, double, const DeviceMemory<double> &, int, + const DeviceMemory<double> &, int, DeviceMemory<double> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasGer, m, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasGerc(uint64 m, uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, + int incx, + const DeviceMemory<std::complex<float>> &y, + int incy, DeviceMemory<std::complex<float>> *a, + int lda) { + VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<uint64, uint64, std::complex<float>, + const 
DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGerc, m, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasGerc(uint64 m, uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, + int incx, + const DeviceMemory<std::complex<double>> &y, + int incy, DeviceMemory<std::complex<double>> *a, + int lda) { + VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<uint64, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGerc, m, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasGeru(uint64 m, uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, + int incx, + const DeviceMemory<std::complex<float>> &y, + int incy, DeviceMemory<std::complex<float>> *a, + int lda) { + VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<uint64, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGeru, m, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasGeru(uint64 m, uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, + int incx, + const DeviceMemory<std::complex<double>> &y, + int incy, DeviceMemory<std::complex<double>> *a, + int lda) { + VLOG_CALL(PARAM(m), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), PARAM(y), + PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<uint64, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGeru, m, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &x, + int incx, std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), + PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHbmv, uplo, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &x, + int incx, std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), + PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, uint64, 
std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHbmv, uplo, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasHemv(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &x, + int incx, std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHemv, uplo, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +Stream &Stream::ThenBlasHemv(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &x, + int incx, std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHemv, uplo, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +Stream &Stream::ThenBlasHer(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<std::complex<float>> *a, + int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, float, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHer, uplo, n, alpha, x, incx, a, + lda); +} + +Stream &Stream::ThenBlasHer(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<std::complex<double>> *a, + int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, double, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHer, uplo, n, alpha, x, incx, a, + lda); +} + +Stream &Stream::ThenBlasHer2(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, + int incx, + const DeviceMemory<std::complex<float>> &y, + int incy, DeviceMemory<std::complex<float>> *a, + int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHer2, uplo, n, 
alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasHer2(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, + int incx, + const DeviceMemory<std::complex<double>> &y, + int incy, DeviceMemory<std::complex<double>> *a, + int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHer2, uplo, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasHpmv(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &ap, + const DeviceMemory<std::complex<float>> &x, + int incx, std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, + const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHpmv, uplo, n, alpha, ap, x, incx, + beta, y, incy); +} + +Stream &Stream::ThenBlasHpmv(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &ap, + const DeviceMemory<std::complex<double>> &x, + int incx, std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, + const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHpmv, uplo, n, alpha, ap, x, incx, + beta, y, incy); +} + +Stream &Stream::ThenBlasHpr(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<std::complex<float>> &x, + int incx, DeviceMemory<std::complex<float>> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, float, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasHpr, uplo, n, alpha, x, incx, ap); +} + +Stream &Stream::ThenBlasHpr(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<std::complex<double>> &x, + int incx, DeviceMemory<std::complex<double>> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, double, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasHpr, uplo, n, alpha, x, incx, ap); +} + +Stream &Stream::ThenBlasHpr2(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, + int incx, + const DeviceMemory<std::complex<float>> &y, + int incy, DeviceMemory<std::complex<float>> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), 
PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasHpr2, uplo, n, alpha, x, incx, y, + incy, ap); +} + +Stream &Stream::ThenBlasHpr2(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, + int incx, + const DeviceMemory<std::complex<double>> &y, + int incy, DeviceMemory<std::complex<double>> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *> impl; + return impl(this, &blas::BlasSupport::DoBlasHpr2, uplo, n, alpha, x, incx, y, + incy, ap); +} + +Stream &Stream::ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, + float alpha, const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), + PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, uint64, float, + const DeviceMemory<float> &, int, const DeviceMemory<float> &, + int, float, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSbmv, uplo, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, + double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), + PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, uint64, double, + const DeviceMemory<double> &, int, const DeviceMemory<double> &, + int, double, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSbmv, uplo, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +Stream &Stream::ThenBlasSpmv(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &ap, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, float, const DeviceMemory<float> &, + const DeviceMemory<float> &, int, float, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSpmv, uplo, n, alpha, ap, x, incx, + beta, y, incy); +} + +Stream &Stream::ThenBlasSpmv(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &ap, + const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(ap), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, double, const DeviceMemory<double> &, + const DeviceMemory<double> &, int, double, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSpmv, uplo, n, alpha, ap, x, incx, + beta, y, incy); +} + +Stream &Stream::ThenBlasSpr(blas::UpperLower uplo, uint64 n, float alpha, + const 
DeviceMemory<float> &x, int incx, + DeviceMemory<float> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, float, const DeviceMemory<float> &, + int, DeviceMemory<float> *> impl; + return impl(this, &blas::BlasSupport::DoBlasSpr, uplo, n, alpha, x, incx, ap); +} + +Stream &Stream::ThenBlasSpr(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, double, const DeviceMemory<double> &, + int, DeviceMemory<double> *> impl; + return impl(this, &blas::BlasSupport::DoBlasSpr, uplo, n, alpha, x, incx, ap); +} + +Stream &Stream::ThenBlasSpr2(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, float, const DeviceMemory<float> &, + int, const DeviceMemory<float> &, int, + DeviceMemory<float> *> impl; + return impl(this, &blas::BlasSupport::DoBlasSpr2, uplo, n, alpha, x, incx, y, + incy, ap); +} + +Stream &Stream::ThenBlasSpr2(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *ap) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), PARAM(ap)); + + ThenBlasImpl<blas::UpperLower, uint64, double, const DeviceMemory<double> &, + int, const DeviceMemory<double> &, int, + DeviceMemory<double> *> impl; + return impl(this, &blas::BlasSupport::DoBlasSpr2, uplo, n, alpha, x, incx, y, + incy, ap); +} + +Stream &Stream::ThenBlasSymv(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, float, const DeviceMemory<float> &, + int, const DeviceMemory<float> &, int, float, + DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSymv, uplo, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +Stream &Stream::ThenBlasSymv(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(x), + PARAM(incx), PARAM(beta), PARAM(y), PARAM(incy)); + + ThenBlasImpl<blas::UpperLower, uint64, double, const DeviceMemory<double> &, + int, const DeviceMemory<double> &, int, double, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSymv, uplo, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +Stream &Stream::ThenBlasSyr(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *a, int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, float, const DeviceMemory<float> &, + int, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr, uplo, 
n, alpha, x, incx, a, + lda); +} + +Stream &Stream::ThenBlasSyr(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *a, int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, double, const DeviceMemory<double> &, + int, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr, uplo, n, alpha, x, incx, a, + lda); +} + +Stream &Stream::ThenBlasSyr2(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, float, const DeviceMemory<float> &, + int, const DeviceMemory<float> &, int, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr2, uplo, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasSyr2(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda) { + VLOG_CALL(PARAM(uplo), PARAM(n), PARAM(alpha), PARAM(x), PARAM(incx), + PARAM(y), PARAM(incy), PARAM(a), PARAM(lda)); + + ThenBlasImpl<blas::UpperLower, uint64, double, const DeviceMemory<double> &, + int, const DeviceMemory<double> &, int, DeviceMemory<double> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr2, uplo, n, alpha, x, incx, y, + incy, a, lda); +} + +Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<float> &, int, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<double> &, int, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, 
uint64 n, uint64 k, + const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbmv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<float> &, int, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<double> &, int, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(k), + PARAM(a), PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + uint64, const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTbsv, uplo, trans, diag, n, k, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, + DeviceMemory<float> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<float> &, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpmv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose 
trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<double> &, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpmv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<float>> &, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpmv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<double>> &, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpmv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, + DeviceMemory<float> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<float> &, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, + DeviceMemory<double> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<double> &, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<float>> &, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(ap), + PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, 
blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<double>> &, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTpsv, uplo, trans, diag, n, ap, x, + incx); +} + +Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<float> &, int, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<double> &, int, DeviceMemory<double> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<float> &, int, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<double> &, int, DeviceMemory<double> *, + int> impl; + return impl(this, 
&blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *x, + int incx) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(diag), PARAM(n), PARAM(a), + PARAM(lda), PARAM(x), PARAM(incx)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, blas::Diagonal, uint64, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrsv, uplo, trans, diag, n, a, + lda, x, incx); +} + +Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float, + const DeviceMemory<float> &, int, const DeviceMemory<float> &, + int, float, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, + double beta, DeviceMemory<double> *c, int ldc) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, double, + const DeviceMemory<double> &, int, const DeviceMemory<double> &, + int, double, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &b, + int ldb, std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, + std::complex<float>, const DeviceMemory<std::complex<float>> &, + int, const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + 
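// ThenBlasImpl dispatches this call to the parent StreamExecutor's BLAS + // support (blas::BlasSupport) via the DoBlasGemm member pointer; the + // VLOG_CALL above logs the parameters when verbose logging is enabled. +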
return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &b, + int ldb, std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, + std::complex<double>, const DeviceMemory<std::complex<double>> &, + int, const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemm, transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &b, + int ldb, std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::Side, blas::UpperLower, uint64, uint64, + std::complex<float>, const DeviceMemory<std::complex<float>> &, + int, const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHemm, side, uplo, m, n, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &b, + int ldb, std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::Side, blas::UpperLower, uint64, uint64, + std::complex<double>, const DeviceMemory<std::complex<double>> &, + int, const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasHemm, side, uplo, m, n, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, float alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, float beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, float, + const DeviceMemory<std::complex<float>> &, int, float, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHerk, uplo, trans, n, k, alpha, a, + lda, beta, c, ldc); +} + +Stream &Stream::ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, double alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, double beta, + 
DeviceMemory<std::complex<double>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, double, + const DeviceMemory<std::complex<double>> &, int, double, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHerk, uplo, trans, n, k, alpha, a, + lda, beta, c, ldc); +} + +Stream &Stream::ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &b, + int ldb, float beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, + std::complex<float>, const DeviceMemory<std::complex<float>> &, + int, const DeviceMemory<std::complex<float>> &, int, float, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHer2k, uplo, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &b, + int ldb, double beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, + std::complex<double>, const DeviceMemory<std::complex<double>> &, + int, const DeviceMemory<std::complex<double>> &, int, double, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasHer2k, uplo, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::Side, blas::UpperLower, uint64, uint64, float, + const DeviceMemory<float> &, int, const DeviceMemory<float> &, + int, float, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, + double beta, DeviceMemory<double> *c, int ldc) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::Side, blas::UpperLower, uint64, uint64, double, + const DeviceMemory<double> &, int, const DeviceMemory<double> &, + int, double, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +Stream 
&Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &b, + int ldb, std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::Side, blas::UpperLower, uint64, uint64, + std::complex<float>, const DeviceMemory<std::complex<float>> &, + int, const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &b, + int ldb, std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(m), PARAM(n), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::Side, blas::UpperLower, uint64, uint64, + std::complex<double>, const DeviceMemory<std::complex<double>> &, + int, const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSymm, side, uplo, m, n, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, float alpha, + const DeviceMemory<float> &a, int lda, float beta, + DeviceMemory<float> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, float, + const DeviceMemory<float> &, int, float, DeviceMemory<float> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, alpha, a, + lda, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, double alpha, + const DeviceMemory<double> &a, int lda, + double beta, DeviceMemory<double> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, double, + const DeviceMemory<double> &, int, double, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, alpha, a, + lda, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, + std::complex<float>, const DeviceMemory<std::complex<float>> &, + int, std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, 
alpha, a, + lda, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(beta), PARAM(c), PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, + std::complex<double>, const DeviceMemory<std::complex<double>> &, + int, std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyrk, uplo, trans, n, k, alpha, a, + lda, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, float, + const DeviceMemory<float> &, int, const DeviceMemory<float> &, + int, float, DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, + double beta, DeviceMemory<double> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, double, + const DeviceMemory<double> &, int, const DeviceMemory<double> &, + int, double, DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, + const DeviceMemory<std::complex<float>> &b, + int ldb, std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, blas::Transpose, uint64, uint64, + std::complex<float>, const DeviceMemory<std::complex<float>> &, + int, const DeviceMemory<std::complex<float>> &, int, + std::complex<float>, DeviceMemory<std::complex<float>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, + uint64 n, uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, + const DeviceMemory<std::complex<double>> &b, + int ldb, std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc) { + VLOG_CALL(PARAM(uplo), PARAM(trans), PARAM(n), PARAM(k), PARAM(alpha), + PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), + PARAM(ldc)); + + ThenBlasImpl<blas::UpperLower, 
blas::Transpose, uint64, uint64, + std::complex<double>, const DeviceMemory<std::complex<double>> &, + int, const DeviceMemory<std::complex<double>> &, int, + std::complex<double>, DeviceMemory<std::complex<double>> *, + int> impl; + return impl(this, &blas::BlasSupport::DoBlasSyr2k, uplo, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *b, int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, float, const DeviceMemory<float> &, int, + DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, double, const DeviceMemory<double> &, int, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *b, + int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *b, + int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrmm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *b, int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), 
PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, float, const DeviceMemory<float> &, int, + DeviceMemory<float> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *b, int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, double, const DeviceMemory<double> &, int, + DeviceMemory<double> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, + int lda, DeviceMemory<std::complex<float>> *b, + int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, std::complex<float>, + const DeviceMemory<std::complex<float>> &, int, + DeviceMemory<std::complex<float>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, + uint64 m, uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, + int lda, DeviceMemory<std::complex<double>> *b, + int ldb) { + VLOG_CALL(PARAM(side), PARAM(uplo), PARAM(transa), PARAM(diag), PARAM(m), + PARAM(n), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb)); + + ThenBlasImpl<blas::Side, blas::UpperLower, blas::Transpose, blas::Diagonal, + uint64, uint64, std::complex<double>, + const DeviceMemory<std::complex<double>> &, int, + DeviceMemory<std::complex<double>> *, int> impl; + return impl(this, &blas::BlasSupport::DoBlasTrsm, side, uplo, transa, diag, m, + n, alpha, a, lda, b, ldb); +} + +Stream &Stream::ThenBlasGemmBatched( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a, + int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, + float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, + int batch_count) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float, + const port::ArraySlice<DeviceMemory<float> *> &, int, + const port::ArraySlice<DeviceMemory<float> *> &, int, float, + const port::ArraySlice<DeviceMemory<float> *> &, int, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); +} + +Stream &Stream::ThenBlasGemmBatched( + blas::Transpose transa, blas::Transpose transb, 
uint64 m, uint64 n, + uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a, + int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, + double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, + int batch_count) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, double, + const port::ArraySlice<DeviceMemory<double> *> &, int, + const port::ArraySlice<DeviceMemory<double> *> &, int, double, + const port::ArraySlice<DeviceMemory<double> *> &, int, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); +} + +Stream &Stream::ThenBlasGemmBatched( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, + std::complex<float> beta, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, + int batch_count) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, + std::complex<float>, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &, + int, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &, + int, std::complex<float>, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &, + int, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); +} + +Stream &Stream::ThenBlasGemmBatched( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<double> alpha, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb, + std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, + int batch_count) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count)); + + ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, + std::complex<double>, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &, + int, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &, + int, std::complex<double>, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &, + int, int> impl; + return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n, + k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count); +} + +Stream &Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) { + VLOG_CALL(PARAM(seed), PARAM(seed_bytes)); + + if (ok()) { + if (rng::RngSupport *rng = parent_->AsRng()) { + CheckError(rng->SetSeed(this, seed, seed_bytes)); + } else { + SetError(); + LOG(INFO) << "stream " << this << " unable to initialize RNG"; + } + } else { + LOG(INFO) << "stream " << this + << " did not set RNG seed: " << static_cast<const void *>(seed) + << "; bytes: " << seed_bytes; + } + 
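// Returning the stream preserves the fluent call-chaining style even when + // an error has occurred; subsequent Then* calls observe !ok() and skip + // their work. +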
return *this; +} + +Stream &Stream::ThenPopulateRandUniform(DeviceMemory<float> *values) { + VLOG_CALL(PARAM(values)); + + if (ok()) { + if (rng::RngSupport *rng = parent_->AsRng()) { + CheckError(rng->DoPopulateRandUniform(this, values)); + } else { + SetError(); + LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " + "without RNG support."; + } + } + return *this; +} + +Stream &Stream::ThenPopulateRandGaussian(float mean, float sd, + DeviceMemory<float> *values) { + VLOG_CALL(PARAM(mean), PARAM(sd), PARAM(values)); + + if (ok()) { + if (rng::RngSupport *rng = parent_->AsRng()) { + CheckError(rng->DoPopulateRandGaussian(this, mean, sd, values)); + } else { + SetError(); + LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " + "without RNG support."; + } + } + return *this; +} + +Stream &Stream::ThenPopulateRandGaussian(double mean, double sd, + DeviceMemory<double> *values) { + VLOG_CALL(PARAM(mean), PARAM(sd), PARAM(values)); + + if (ok()) { + if (rng::RngSupport *rng = parent_->AsRng()) { + CheckError(rng->DoPopulateRandGaussian(this, mean, sd, values)); + } else { + SetError(); + LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " + "without RNG support."; + } + } + return *this; +} + +Stream &Stream::ThenPopulateRandUniform(DeviceMemory<double> *values) { + VLOG_CALL(PARAM(values)); + + if (ok()) { + if (rng::RngSupport *rng = parent_->AsRng()) { + CheckError(rng->DoPopulateRandUniform(this, values)); + } else { + SetError(); + LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " + "without RNG support."; + } + } + return *this; +} + +Stream &Stream::ThenPopulateRandUniform( + DeviceMemory<std::complex<float>> *values) { + VLOG_CALL(PARAM(values)); + + if (ok()) { + if (rng::RngSupport *rng = parent_->AsRng()) { + CheckError(rng->DoPopulateRandUniform(this, values)); + } else { + SetError(); + LOG(INFO) << "attempting to perform RNG operation using StreamExecutor " + "without RNG support."; + } + } + return *this; +} + +Stream &Stream::ThenPopulateRandUniform( + DeviceMemory<std::complex<double>> *values) { + VLOG_CALL(PARAM(values)); + + if (ok()) { + if (rng::RngSupport *rng = parent_->AsRng()) { + CheckError(rng->DoPopulateRandUniform(this, values)); + } else { + SetError(); + LOG(INFO) << "stream " << this + << " attempting to perform RNG operation using StreamExecutor " + "without RNG support."; + } + } + return *this; +} + +Stream &Stream::ThenMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size) { + VLOG_CALL(PARAM(host_dst), PARAM(gpu_src), PARAM(size)); + + if (ok()) { + CheckError(parent_->Memcpy(this, host_dst, gpu_src, size)); + } else { + LOG(INFO) << "stream " << this + << " did not memcpy device-to-host; source: " << gpu_src.opaque(); + } + return *this; +} + +Stream &Stream::ThenMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size) { + VLOG_CALL(PARAM(gpu_dst), PARAM(host_src), PARAM(size)); + + if (ok()) { + CheckError(parent_->Memcpy(this, gpu_dst, host_src, size)); + } else { + LOG(INFO) << "stream " << this + << " did not memcpy host-to-device; source: " << host_src; + } + return *this; +} + +Stream &Stream::ThenMemcpy(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + VLOG_CALL(PARAM(gpu_dst), PARAM(gpu_src), PARAM(size)); + + if (ok()) { + CheckError(parent_->MemcpyDeviceToDevice(this, gpu_dst, gpu_src, size)); + } else { + LOG(INFO) << "stream " << this + << " did not memcpy gpu-to-gpu; source: " << &gpu_src; + } + 
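// The copy is merely enqueued here; it executes asynchronously with + // respect to the host, and BlockHostUntilDone() synchronizes with its + // completion. +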
return *this; +} + +Stream &Stream::ThenMemZero(DeviceMemoryBase *location, uint64 size) { + VLOG_CALL(PARAM(location), PARAM(size)); + + if (ok()) { + CheckError(parent_->MemZero(this, location, size)); + } else { + LOG(INFO) << "stream " << this + << " did not memzero GPU location: " << location; + } + return *this; +} + +Stream &Stream::ThenMemset32(DeviceMemoryBase *location, const uint32 &pattern, + uint64 size) { + VLOG_CALL(PARAM(location), PARAM(pattern), PARAM(size)); + + if (ok()) { + CheckError(parent_->Memset32(this, location, pattern, size)); + } else { + LOG(INFO) << "stream " << this + << " did not memset GPU location: " << location + << "; size: " << size << "; pattern: " << std::hex << pattern; + } + return *this; +} + +Stream &Stream::ThenDoHostCallbackForTest(std::function<void()> callback) { + VLOG_CALL(PARAM(callback)); + + return ThenDoHostCallback(callback); +} + +Stream &Stream::ThenDoHostCallback(std::function<void()> callback) { + VLOG_CALL(PARAM(callback)); + + if (ok()) { + CheckError(parent_->HostCallback(this, callback)); + } else { + LOG(INFO) << "stream " << this + << " was in error state before adding host callback"; + } + return *this; +} + +Stream &Stream::ThenFft(fft::Plan *plan, + const DeviceMemory<std::complex<float>> &input, + DeviceMemory<std::complex<float>> *output) { + VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); + + if (ok()) { + if (fft::FftSupport *fft = parent_->AsFft()) { + CheckError(fft->DoFft(this, plan, input, output)); + } else { + SetError(); + LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " + "without FFT support"; + } + } + return *this; +} + +Stream &Stream::ThenFft(fft::Plan *plan, + const DeviceMemory<std::complex<double>> &input, + DeviceMemory<std::complex<double>> *output) { + VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); + + if (ok()) { + if (fft::FftSupport *fft = parent_->AsFft()) { + CheckError(fft->DoFft(this, plan, input, output)); + } else { + SetError(); + LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " + "without FFT support"; + } + } + return *this; +} + +Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory<float> &input, + DeviceMemory<std::complex<float>> *output) { + VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); + + if (ok()) { + if (fft::FftSupport *fft = parent_->AsFft()) { + CheckError(fft->DoFft(this, plan, input, output)); + } else { + SetError(); + LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " + "without FFT support"; + } + } + return *this; +} + +Stream &Stream::ThenFft(fft::Plan *plan, const DeviceMemory<double> &input, + DeviceMemory<std::complex<double>> *output) { + VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); + + if (ok()) { + if (fft::FftSupport *fft = parent_->AsFft()) { + CheckError(fft->DoFft(this, plan, input, output)); + } else { + SetError(); + LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " + "without FFT support"; + } + } + return *this; +} + +Stream &Stream::ThenFft(fft::Plan *plan, + const DeviceMemory<std::complex<float>> &input, + DeviceMemory<float> *output) { + VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); + + if (ok()) { + if (fft::FftSupport *fft = parent_->AsFft()) { + CheckError(fft->DoFft(this, plan, input, output)); + } else { + SetError(); + LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " + "without FFT support"; + } + } + return *this; +} + +Stream &Stream::ThenFft(fft::Plan *plan, + const
DeviceMemory<std::complex<double>> &input, + DeviceMemory<double> *output) { + VLOG_CALL(PARAM(plan), PARAM(input), PARAM(output)); + + if (ok()) { + if (fft::FftSupport *fft = parent_->AsFft()) { + CheckError(fft->DoFft(this, plan, input, output)); + } else { + SetError(); + LOG(INFO) << "attempting to perform FFT operation using StreamExecutor " + "without FFT support"; + } + } + return *this; +} + +// It looks confusing, but all this is doing is inserting a callback at the +// present point in the stream to then enqueue a task on the host executor. +Stream &Stream::ThenEnqueueOnBackgroundThread( + std::function<void(StreamExecutor *)> task) { + VLOG_CALL(PARAM(task)); + + StreamExecutor *stream_executor = this->parent_; + std::function<void()> bound_task = std::bind(task, stream_executor); + + return ThenDoHostCallback([stream_executor, bound_task]() { + stream_executor->EnqueueOnBackgroundThread(bound_task); + }); +} + +bool Stream::BlockHostUntilDone() { + VLOG_CALL(); + + if (!ok()) { + LOG(INFO) + << "stream " << this + << " did not block host until done; was already in an error state"; + return false; + } + + { + // Wait until all active sub-streams have done their tasks. + mutex_lock lock{mu_}; + for (auto &stream : sub_streams_) { + if (!stream.second) { + CheckError(stream.first->BlockHostUntilDone()); + // Set this sub-stream as available. + stream.second = true; + } + } + } + + temporary_memory_manager_.DeallocateFinalizedTemporaries(); + + CheckError(parent_->BlockHostUntilDone(this)); + return ok(); +} + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h new file mode 100644 index 0000000000..d4d5e7729b --- /dev/null +++ b/tensorflow/stream_executor/stream.h @@ -0,0 +1,1258 @@ +// The Stream is used in conjunction with the StreamExecutor "parent" to +// perform actions with a linear stream of dependencies. Dependencies can also +// be created between Streams to do task management (i.e. limit which tasks +// can be performed concurrently and specify what task dependencies exist). + +#ifndef TENSORFLOW_STREAM_EXECUTOR_STREAM_H_ +#define TENSORFLOW_STREAM_EXECUTOR_STREAM_H_ + +#include <complex> +#include <functional> +#include <memory> + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/fft.h" +#include "tensorflow/stream_executor/kernel.h" +#include "tensorflow/stream_executor/launch_dim.h" +#include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/temporary_memory_manager.h" + +namespace perftools { +namespace gputools { + +namespace host { +class HostBlas; +class HostFft; +class HostRng; +class HostTimer; +} // namespace host + +namespace ocl { +class CLBlas; +} // namespace ocl + +namespace internal { +class StreamInterface; +} // namespace internal + +class DeviceMemoryBase; +template <typename ElemT> +class DeviceMemory; + +class Timer; + +namespace dnn { +struct BatchDescriptor; +struct FilterDescriptor; +struct ConvolutionDescriptor; +} // namespace dnn + +class StreamExecutor; + +// Represents a stream of dependent computations on a GPU device. 
+// +// The operations within a stream execute linearly and asynchronously until +// BlockHostUntilDone() is invoked, which synchronously joins host code with +// the execution of the stream. +// +// If any given operation fails when entraining work for the stream, ok() will +// indicate that an error has occurred. After initialization, once a stream is +// !ok(), it will never be ok(). +// +// Thread-safe post-initialization. +class Stream { + public: + // Instantiate a stream tied to parent as a platform executor. Work + // entrained onto this stream will be launched/managed on that + // StreamExecutor's platform. + explicit Stream(StreamExecutor *parent); + + // Test only. Use an externally-populated value (like a mock) for the + // platform-specific stream implementation. + Stream(StreamExecutor *parent, internal::StreamInterface *implementation); + + // Deallocates any stream resources that the parent StreamExecutor has + // bestowed + // upon this object. + ~Stream(); + + // Returns whether any errors have occurred while entraining work for this + // stream. + bool ok() const { return !InErrorState(); } + + // Initialize the stream. This must be performed before entraining any other + // operations. + Stream &Init(); + + // Initializes timer t via the StreamExecutor. + Stream &InitTimer(Timer *t); + + // Convenience wrapper around Init() and InitTimer(). + Stream &InitWithTimer(Timer *t); + + // Warning! After calling BlockHostUntilDone(), all sub-streams will be + // returned and hence invalid. This may be a temporary solution to the issue + // b/18070215. + // Get or create a sub-stream from this stream. If there is any sub-stream + // in the pool that can be reused then just return this sub-stream. + // Otherwise + // create a new sub-stream. + Stream *GetOrCreateSubStream(); + + // Return the sub-stream back to the host stream so that it can be reused + // later. + void ReturnSubStream(Stream *sub_stream); + + // Allocate temporary memories. The stream will deallocate them when blocked + // or destroyed. + template <typename T> + port::StatusOr<std::unique_ptr<TemporaryDeviceMemory<T>>> + AllocateTemporaryArray(uint64 element_count); + + // Entrains onto the stream of operations: a kernel launch with the given + // (variadic) parameters for the invocation. These arguments can be things + // like DeviceMemory or primitive types such as int. What arguments you may + // pass to a given kernel are noted as the template parameters to the + // TypedKernel type that the machocc compiler generates. + // + // Template parameters: + // Params... The type list of formal parameters that the typed kernel + // expects, which is matched against Args... + // Args... The deduced type list for passed actual arguments + // + // Implementation: A compile-time compatibility check is performed that has + // some leniency versus an exact parameter pack match -- for example, + // `const DeviceMemory<T>` is considered "pack compatible" with a + // `const DeviceMemory<T>&` formal parameter; in part, because we don't have + // perfect forwarding support without rvalue references. It also attempts to + // spit out helpful static_assert error traces with information as to the + // argument number and types that were mismatched. + template <typename... Params, typename... Args> + Stream &ThenLaunch(ThreadDim thread_dims, BlockDim block_dims, + const TypedKernel<Params...> &kernel, Args... 
args); + + // Record a "start" event for the interval timer at this point in the + // stream's + // execution (relative to the previously and subsequently enqueued items in + // the stream's execution). Streams may be started/stopped multiple times. + Stream &ThenStartTimer(Timer *t); + + // Record a "stop" event for the interval timer at this point in the + // stream's + // execution. See also Stream::ThenStartTimer. + Stream &ThenStopTimer(Timer *t); + + // TODO(leary) If work is added to the stream that is being depended upon, + // then what? Have to describe what happens. + template <typename... Params> + Stream &ThenWaitFor(Stream *other, Params... more_streams) { + return ThenWaitFor(more_streams...).ThenWaitFor(other); + } + + // Create a dependency for this stream's next work on the other stream + // completing. Does not take ownership of other, and other must not be + // null. + // + // Checks that a stream does not wait for itself, and it is up to the + // user to guarantee that a stream does not come to wait on itself in a + // cyclic + // manner; in that case, behavior is undefined. + // + // N.B. Base recursion case for the variadic ThenWaitFor. + Stream &ThenWaitFor(Stream *other); + + // Waits for all of the streams in others. + // Checks that there is no shallow circular wait (i.e. that "this" is not in + // others). + Stream &ThenWaitFor(std::vector<std::unique_ptr<Stream>> *others); + + // Waits for an event object to be set. + // Note that ThenRecordEvent must have been called on the event before + // you call this function; otherwise the event will be considered complete + // and this wait will do nothing. + Stream &ThenWaitFor(Event *event); + + // Inserts the specified event into the end of this stream. Once the stream + // has processed all events prior to the insertion point, the event will be + // marked as completed. + // The stream does not take ownership of event - meaning that event's lifetime + // must extend past the point at which it is marked complete! + Stream &ThenRecordEvent(Event *event); + + //////////////// + // DNN support + // + // See DnnSupport::* for comments on the following methods. + + // TODO(leary) add double-precision version of this interface.
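+ // As a purely illustrative sketch (the descriptor and memory names below + // are hypothetical, and the dnn::* descriptors are assumed to be already + // configured): + // + // DeviceMemory<float> output = ...; + // stream.ThenConvolve(input_desc, input_data, filter_desc, filter_data, + // conv_desc, output_desc, &output); + // if (!stream.ok()) { /* the convolution could not be enqueued */ }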
+ Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor, + const DeviceMemory<float> &input_data, + const dnn::FilterDescriptor &filter_descriptor, + const DeviceMemory<float> &filter_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> *output); + + Stream &ThenSeparableConvolve( + const dnn::BatchDescriptor &input_descriptor, + const DeviceMemory<float> &input_data, + const dnn::FilterDescriptor &filter_descriptor, int depth_multiplier, + const DeviceMemory<float> &first_weights, + const DeviceMemory<float> &second_weights, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> *output); + + Stream &ThenConvolveBackwardData( + const dnn::FilterDescriptor &filter_descriptor, + const DeviceMemory<float> &filter_data, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &input_descriptor, + DeviceMemory<float> *backward_input_data); + + Stream &ThenConvolveBackwardFilter( + const dnn::BatchDescriptor &input_descriptor, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> backward_output_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::FilterDescriptor &filter_descriptor, + DeviceMemory<float> *backward_filter_data); + + Stream &ThenMatMul(const DeviceMemory<float> &input_data, + const DeviceMemory<float> &weights, + const dnn::BatchDescriptor &input_dimensions, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + + Stream &ThenMatMulQuantized(const DeviceMemory<float> &input_data, + const DeviceMemory<int8> &weights, + const DeviceMemory<float> &weight_scales, + const dnn::BatchDescriptor &input_dimensions, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + + Stream &ThenMatMulQuantized(const DeviceMemory<float> &input_data, + const DeviceMemory<int16> &weights, + const DeviceMemory<float> &weight_scales, + const dnn::BatchDescriptor &input_dimensions, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + + Stream &ThenBiasAdd(const DeviceMemory<float> &input_data, + const DeviceMemory<float> &biases, + const dnn::BatchDescriptor &dimensions, + DeviceMemory<float> *output_data); + + Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + + Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions, + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + const DeviceMemory<float> &output_data, + const DeviceMemory<float> &input_diff_data, + DeviceMemory<float> *output_diff_data); + + Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor, + const DeviceMemory<float> &input_data, + DeviceMemory<float> *output_data); + + Stream &ThenActivate(dnn::ActivationMode activation_mode, + const dnn::BatchDescriptor &dimensions, + const DeviceMemory<float> &input_data, + DeviceMemory<float> *output_data); + + Stream &ThenDepthConcatenate( + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + 
port::ArraySlice<const DeviceMemory<float> *> input_data, + DeviceMemory<float> *output_data); + + Stream &ThenElementwiseOperate( + dnn::ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + + // See DnnSupport::DoMemcpyD2HQuantized. + // TODO(wgulland) Use a template to merge the versions of + // ThenMemcpyD2HQuantized. + Stream &ThenMemcpyD2HQuantized(const DeviceMemory<float> &gpu_unquantized_src, + port::MutableArraySlice<uint8> host_dst); + + // See DnnSupport::DoMemcpyD2HQuantized. + Stream &ThenMemcpyD2HQuantized(const DeviceMemory<float> &gpu_unquantized_src, + port::MutableArraySlice<uint16> host_dst); + + // See DnnSupport::DoMemcpyD2HQuantized. + Stream &ThenMemcpyD2HQuantized(const DeviceMemory<float> &gpu_unquantized_src, + port::MutableArraySlice<int32> host_dst); + + // See DnnSupport::DoMemcpyH2DQuantized. + Stream &ThenMemcpyH2DQuantized(port::ArraySlice<uint8> host_src, + DeviceMemory<float> *gpu_unquantized_dst); + + ///////////////// + // BLAS support + + // See BlasSupport::DoBlasAsum. + Stream &ThenBlasAsum(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<float> *result); + Stream &ThenBlasAsum(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<double> *result); + Stream &ThenBlasAsum(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result); + Stream &ThenBlasAsum(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result); + + // See BlasSupport::DoBlasAxpy. Note that, even for the case where alpha is + // present in DeviceMemory, it must be an execution-time constant (i.e. a + // value + // that the stream does not change or populate during the course of + // execution). The value is effectively captured at stream-enqueue time. + Stream &ThenBlasAxpy(uint64 elem_count, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *y, int incy); + Stream &ThenBlasAxpy(uint64 elem_count, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *y, int incy); + Stream &ThenBlasAxpy(uint64 elem_count, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy); + Stream &ThenBlasAxpy(uint64 elem_count, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy); + + // See BlasSupport::DoBlasCopy. + Stream &ThenBlasCopy(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<float> *y, int incy); + Stream &ThenBlasCopy(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<double> *y, int incy); + Stream &ThenBlasCopy(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *y, int incy); + Stream &ThenBlasCopy(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *y, int incy); + + // See BlasSupport::DoBlasDot. 
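+ // For example, a single-precision dot product of two length-n vectors with + // unit strides might be enqueued as follows (an illustrative sketch; the + // result lives in device memory and must be copied back before the host can + // read it): + // + // stream.ThenBlasDot(n, x, 1, y, 1, &result) + // .ThenMemcpy(&host_result, result, sizeof(host_result)) + // .BlockHostUntilDone();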
+ Stream &ThenBlasDot(uint64 elem_count, const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *result); + Stream &ThenBlasDot(uint64 elem_count, const DeviceMemory<double> &x, + int incx, const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *result); + + // See BlasSupport::DoBlasDotc. + Stream &ThenBlasDotc(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result); + Stream &ThenBlasDotc(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result); + + // See BlasSupport::DoBlasDotu. + Stream &ThenBlasDotu(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *result); + Stream &ThenBlasDotu(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *result); + + // See BlasSupport::DoBlasNrm2. + Stream &ThenBlasNrm2(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<float> *result); + Stream &ThenBlasNrm2(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<double> *result); + Stream &ThenBlasNrm2(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<float> *result); + Stream &ThenBlasNrm2(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<double> *result); + + // See BlasSupport::DoBlasRot. + Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, float c, float s); + Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, double c, double s); + Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<std::complex<float>> *x, + int incx, DeviceMemory<std::complex<float>> *y, int incy, + float c, float s); + Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<std::complex<double>> *x, + int incx, DeviceMemory<std::complex<double>> *y, int incy, + double c, double s); + + // See BlasSupport::DoBlasRotg. + Stream &ThenBlasRotg(DeviceMemory<float> *a, DeviceMemory<float> *b, + DeviceMemory<float> *c, DeviceMemory<float> *s); + Stream &ThenBlasRotg(DeviceMemory<double> *a, DeviceMemory<double> *b, + DeviceMemory<double> *c, DeviceMemory<double> *s); + Stream &ThenBlasRotg(DeviceMemory<std::complex<float>> *a, + DeviceMemory<std::complex<float>> *b, + DeviceMemory<float> *c, + DeviceMemory<std::complex<float>> *s); + Stream &ThenBlasRotg(DeviceMemory<std::complex<double>> *a, + DeviceMemory<std::complex<double>> *b, + DeviceMemory<double> *c, + DeviceMemory<std::complex<double>> *s); + + // See BlasSupport::DoBlasRotm. + Stream &ThenBlasRotm(uint64 elem_count, DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy, + const DeviceMemory<float> &param); + Stream &ThenBlasRotm(uint64 elem_count, DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy, + const DeviceMemory<double> &param); + + // See BlasSupport::DoBlasRotmg.
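+ // (For both DoBlasRotm and DoBlasRotmg, param follows the standard BLAS + // xROTM convention: a 5-element array whose first element is the flag.)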
+ Stream &ThenBlasRotmg(DeviceMemory<float> *d1, DeviceMemory<float> *d2, + DeviceMemory<float> *x1, const DeviceMemory<float> &y1, + DeviceMemory<float> *param); + Stream &ThenBlasRotmg(DeviceMemory<double> *d1, DeviceMemory<double> *d2, + DeviceMemory<double> *x1, + const DeviceMemory<double> &y1, + DeviceMemory<double> *param); + + // See BlasSupport::DoBlasScal. + Stream &ThenBlasScal(uint64 elem_count, float alpha, DeviceMemory<float> *x, + int incx); + Stream &ThenBlasScal(uint64 elem_count, double alpha, DeviceMemory<double> *x, + int incx); + Stream &ThenBlasScal(uint64 elem_count, float alpha, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasScal(uint64 elem_count, double alpha, + DeviceMemory<std::complex<double>> *x, int incx); + Stream &ThenBlasScal(uint64 elem_count, std::complex<float> alpha, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasScal(uint64 elem_count, std::complex<double> alpha, + DeviceMemory<std::complex<double>> *x, int incx); + + // See BlasSupport::DoBlasSwap. + Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<float> *x, int incx, + DeviceMemory<float> *y, int incy); + Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<double> *x, int incx, + DeviceMemory<double> *y, int incy); + Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<std::complex<float>> *x, + int incx, DeviceMemory<std::complex<float>> *y, + int incy); + Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<std::complex<double>> *x, + int incx, DeviceMemory<std::complex<double>> *y, + int incy); + + // See BlasSupport::DoBlasIamax. + Stream &ThenBlasIamax(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<int> *result); + Stream &ThenBlasIamax(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<int> *result); + Stream &ThenBlasIamax(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result); + Stream &ThenBlasIamax(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<int> *result); + + // See BlasSupport::DoBlasIamin. + Stream &ThenBlasIamin(uint64 elem_count, const DeviceMemory<float> &x, + int incx, DeviceMemory<int> *result); + Stream &ThenBlasIamin(uint64 elem_count, const DeviceMemory<double> &x, + int incx, DeviceMemory<int> *result); + Stream &ThenBlasIamin(uint64 elem_count, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<int> *result); + Stream &ThenBlasIamin(uint64 elem_count, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<int> *result); + + // See BlasSupport::DoBlasGbmv. 
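+ // (kl and ku are the number of sub- and super-diagonals of the band matrix + // a, following the standard BLAS xGBMV convention.)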
+ Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, + uint64 ku, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &x, int incx, + float beta, DeviceMemory<float> *y, int incy); + Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, + uint64 ku, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &x, int incx, + double beta, DeviceMemory<double> *y, int incy); + Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, + uint64 ku, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy); + Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl, + uint64 ku, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy); + + // See BlasSupport::DoBlasGemv. + Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy); + Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy); + Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy); + Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy); + + // See BlasSupport::DoBlasGer. + Stream &ThenBlasGer(uint64 m, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda); + Stream &ThenBlasGer(uint64 m, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda); + + // See BlasSupport::DoBlasGerc. + Stream &ThenBlasGerc(uint64 m, uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda); + Stream &ThenBlasGerc(uint64 m, uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda); + + // See BlasSupport::DoBlasGeru. 
+ Stream &ThenBlasGeru(uint64 m, uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda); + Stream &ThenBlasGeru(uint64 m, uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda); + + // See BlasSupport::DoBlasHbmv. + Stream &ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy); + Stream &ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy); + + // See BlasSupport::DoBlasHemv. + Stream &ThenBlasHemv(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy); + Stream &ThenBlasHemv(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy); + + // See BlasSupport::DoBlasHer. + Stream &ThenBlasHer(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *a, int lda); + Stream &ThenBlasHer(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *a, int lda); + + // See BlasSupport::DoBlasHer2. + Stream &ThenBlasHer2(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *a, int lda); + Stream &ThenBlasHer2(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *a, int lda); + + // See BlasSupport::DoBlasHpmv. + Stream &ThenBlasHpmv(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &ap, + const DeviceMemory<std::complex<float>> &x, int incx, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *y, int incy); + Stream &ThenBlasHpmv(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &ap, + const DeviceMemory<std::complex<double>> &x, int incx, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *y, int incy); + + // See BlasSupport::DoBlasHpr. 
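+ // (ap denotes packed storage: the referenced triangle of the Hermitian + // matrix laid out column by column in n*(n+1)/2 contiguous elements, per + // the standard BLAS packed convention.)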
+ Stream &ThenBlasHpr(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + DeviceMemory<std::complex<float>> *ap); + Stream &ThenBlasHpr(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + DeviceMemory<std::complex<double>> *ap); + + // See BlasSupport::DoBlasHpr2. + Stream &ThenBlasHpr2(blas::UpperLower uplo, uint64 n, + std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &x, int incx, + const DeviceMemory<std::complex<float>> &y, int incy, + DeviceMemory<std::complex<float>> *ap); + Stream &ThenBlasHpr2(blas::UpperLower uplo, uint64 n, + std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &x, int incx, + const DeviceMemory<std::complex<double>> &y, int incy, + DeviceMemory<std::complex<double>> *ap); + + // See BlasSupport::DoBlasSbmv. + Stream &ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy); + Stream &ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy); + + // See BlasSupport::DoBlasSpmv. + Stream &ThenBlasSpmv(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &ap, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy); + Stream &ThenBlasSpmv(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &ap, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy); + + // See BlasSupport::DoBlasSpr. + Stream &ThenBlasSpr(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *ap); + Stream &ThenBlasSpr(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *ap); + + // See BlasSupport::DoBlasSpr2. + Stream &ThenBlasSpr2(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *ap); + Stream &ThenBlasSpr2(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *ap); + + // See BlasSupport::DoBlasSymv. + Stream &ThenBlasSymv(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &x, int incx, float beta, + DeviceMemory<float> *y, int incy); + Stream &ThenBlasSymv(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &x, int incx, double beta, + DeviceMemory<double> *y, int incy); + + // See BlasSupport::DoBlasSyr. + Stream &ThenBlasSyr(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + DeviceMemory<float> *a, int lda); + Stream &ThenBlasSyr(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + DeviceMemory<double> *a, int lda); + + // See BlasSupport::DoBlasSyr2. 
+ Stream &ThenBlasSyr2(blas::UpperLower uplo, uint64 n, float alpha, + const DeviceMemory<float> &x, int incx, + const DeviceMemory<float> &y, int incy, + DeviceMemory<float> *a, int lda); + Stream &ThenBlasSyr2(blas::UpperLower uplo, uint64 n, double alpha, + const DeviceMemory<double> &x, int incx, + const DeviceMemory<double> &y, int incy, + DeviceMemory<double> *a, int lda); + + // See BlasSupport::DoBlasTbmv. + Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx); + Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx); + Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx); + + // See BlasSupport::DoBlasTbsv. + Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx); + Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx); + Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, uint64 k, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx); + + // See BlasSupport::DoBlasTpmv. + Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx); + Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, DeviceMemory<double> *x, + int incx); + Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx); + + // See BlasSupport::DoBlasTpsv. 
+ Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &ap, DeviceMemory<float> *x, + int incx); + Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &ap, DeviceMemory<double> *x, + int incx); + Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &ap, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &ap, + DeviceMemory<std::complex<double>> *x, int incx); + + // See BlasSupport::DoBlasTrmv. + Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx); + Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx); + Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx); + + // See BlasSupport::DoBlasTrsv. + Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<float> &a, int lda, + DeviceMemory<float> *x, int incx); + Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<double> &a, int lda, + DeviceMemory<double> *x, int incx); + Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *x, int incx); + Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans, + blas::Diagonal diag, uint64 n, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *x, int incx); + + // See BlasSupport::DoBlasGemm. 
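+ // For example, a single-precision C <- alpha*A*B + beta*C with no + // transposition might be enqueued as follows (an illustrative sketch; the + // leading dimensions assume the usual column-major BLAS storage): + // + // stream.ThenBlasGemm(blas::Transpose::kNoTranspose, + // blas::Transpose::kNoTranspose, m, n, k, + // /*alpha=*/1.0f, a, /*lda=*/m, b, /*ldb=*/k, + // /*beta=*/0.0f, &c, /*ldc=*/m);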
+ Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, float alpha, + const DeviceMemory<float> &a, int lda, + const DeviceMemory<float> &b, int ldb, float beta, + DeviceMemory<float> *c, int ldc); + Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, double alpha, + const DeviceMemory<double> &a, int lda, + const DeviceMemory<double> &b, int ldb, double beta, + DeviceMemory<double> *c, int ldc); + Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc); + Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc); + + // See BlasSupport::DoBlasGemmBatched. + Stream &ThenBlasGemmBatched(blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, float alpha, + const port::ArraySlice<DeviceMemory<float> *> &a, + int lda, + const port::ArraySlice<DeviceMemory<float> *> &b, + int ldb, float beta, + const port::ArraySlice<DeviceMemory<float> *> &c, + int ldc, int batch_count); + Stream &ThenBlasGemmBatched(blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, double alpha, + const port::ArraySlice<DeviceMemory<double> *> &a, + int lda, + const port::ArraySlice<DeviceMemory<double> *> &b, + int ldb, double beta, + const port::ArraySlice<DeviceMemory<double> *> &c, + int ldc, int batch_count); + Stream &ThenBlasGemmBatched( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<float> alpha, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, + std::complex<float> beta, + const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, + int batch_count); + Stream &ThenBlasGemmBatched( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, std::complex<double> alpha, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb, + std::complex<double> beta, + const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc, + int batch_count); + + // See BlasSupport::DoBlasHemm. + Stream &ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc); + Stream &ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc); + + // See BlasSupport::DoBlasHerk. 
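+ // (Note that alpha and beta are real-valued even though a and c are + // complex: a Hermitian rank-k update keeps c Hermitian, with a real + // diagonal, per the standard BLAS xHERK convention.)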
+ Stream &ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, float alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc); + Stream &ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, double alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc); + + // See BlasSupport::DoBlasHer2k. + Stream &ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + float beta, DeviceMemory<std::complex<float>> *c, + int ldc); + Stream &ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + double beta, DeviceMemory<std::complex<double>> *c, + int ldc); + + // See BlasSupport::DoBlasSymm. + Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &b, int ldb, + float beta, DeviceMemory<float> *c, int ldc); + Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &b, int ldb, + double beta, DeviceMemory<double> *c, int ldc); + Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc); + Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc); + + // See BlasSupport::DoBlasSyrk. + Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, float alpha, const DeviceMemory<float> &a, + int lda, float beta, DeviceMemory<float> *c, int ldc); + Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, double alpha, const DeviceMemory<double> &a, + int lda, double beta, DeviceMemory<double> *c, int ldc); + Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc); + Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc); + + // See BlasSupport::DoBlasSyr2k. 
+ Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, float alpha, const DeviceMemory<float> &a, + int lda, const DeviceMemory<float> &b, int ldb, + float beta, DeviceMemory<float> *c, int ldc); + Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, double alpha, const DeviceMemory<double> &a, + int lda, const DeviceMemory<double> &b, int ldb, + double beta, DeviceMemory<double> *c, int ldc); + Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + const DeviceMemory<std::complex<float>> &b, int ldb, + std::complex<float> beta, + DeviceMemory<std::complex<float>> *c, int ldc); + Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n, + uint64 k, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + const DeviceMemory<std::complex<double>> &b, int ldb, + std::complex<double> beta, + DeviceMemory<std::complex<double>> *c, int ldc); + + // See BlasSupport::DoBlasTrmm. + Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, float alpha, const DeviceMemory<float> &a, + int lda, DeviceMemory<float> *b, int ldb); + Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, double alpha, const DeviceMemory<double> &a, + int lda, DeviceMemory<double> *b, int ldb); + Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb); + Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb); + + // See BlasSupport::DoBlasTrsm. + Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, float alpha, const DeviceMemory<float> &a, + int lda, DeviceMemory<float> *b, int ldb); + Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, double alpha, const DeviceMemory<double> &a, + int lda, DeviceMemory<double> *b, int ldb); + Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, std::complex<float> alpha, + const DeviceMemory<std::complex<float>> &a, int lda, + DeviceMemory<std::complex<float>> *b, int ldb); + Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo, + blas::Transpose transa, blas::Diagonal diag, uint64 m, + uint64 n, std::complex<double> alpha, + const DeviceMemory<std::complex<double>> &a, int lda, + DeviceMemory<std::complex<double>> *b, int ldb); + + // See FftSupport::DoFft. 
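+ // For example, given an fft::Plan already created through the platform's + // FftSupport (a sketch; plan construction is not shown here): + // + // stream.ThenFft(plan, complex_input, &complex_output); + // if (!stream.ok()) { /* FFT support was absent or enqueueing failed */ }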
+ Stream &ThenFft(fft::Plan *plan, + const DeviceMemory<std::complex<float>> &input, + DeviceMemory<std::complex<float>> *output); + Stream &ThenFft(fft::Plan *plan, + const DeviceMemory<std::complex<double>> &input, + DeviceMemory<std::complex<double>> *output); + Stream &ThenFft(fft::Plan *plan, const DeviceMemory<float> &input, + DeviceMemory<std::complex<float>> *output); + Stream &ThenFft(fft::Plan *plan, const DeviceMemory<double> &input, + DeviceMemory<std::complex<double>> *output); + Stream &ThenFft(fft::Plan *plan, + const DeviceMemory<std::complex<float>> &input, + DeviceMemory<float> *output); + Stream &ThenFft(fft::Plan *plan, + const DeviceMemory<std::complex<double>> &input, + DeviceMemory<double> *output); + + // Makes the RNG use the provided value as the basis for further generation. + // /dev/urandom (good) and /dev/random (better, but sometimes slow) are good + // sources of seed data if the default (high quality) sources are not + // desired. + // For most use cases, this function will not be necessary; each provided + // back-end implementation will be appropriately seeded by default. + // At a minimum 16 bytes of data are required in the seed buffer. + // + // To seed with good (non-reproducible) data: + // File* f = File::Open("/dev/random", "r"); + // int64 bytes_read = f->Read(seed_data, bytes_to_read); + // < error checking > + // stream.ThenSetRngSeed(seed_data, bytes_read); + // + // To seed with reproducible data: + // uint64_t seed_data[2] = { <data> }; + // stream.ThenSetRngSeed(seed_data, 16); + Stream &ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes); + + // Populates the memory indicated by values with uniform-random-distribution + // values. TODO(leary) seeding API/description + // + // Uses the type and size of the DeviceMemory to infer what data should be + // populated. + Stream &ThenPopulateRandUniform(DeviceMemory<float> *values); + Stream &ThenPopulateRandUniform(DeviceMemory<double> *values); + Stream &ThenPopulateRandUniform(DeviceMemory<std::complex<float>> *values); + Stream &ThenPopulateRandUniform(DeviceMemory<std::complex<double>> *values); + Stream &ThenPopulateRandGaussian(float mean, float stddev, + DeviceMemory<float> *values); + Stream &ThenPopulateRandGaussian(double mean, double stddev, + DeviceMemory<double> *values); + + // Entrain onto the stream: a memcpy to a host destination from a GPU source + // of the given target size. host_dst must be a pointer to host memory + // allocated by StreamExecutor::HostMemoryAllocate or otherwise allocated and + // then registered with StreamExecutor::HostMemoryRegister. + Stream &ThenMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size); + + // Entrain onto the stream: a memcpy to a GPU destination from a host source + // of the given target size. host_src must be a pointer to host memory + // allocated by StreamExecutor::HostMemoryAllocate or otherwise allocated and + // then registered with StreamExecutor::HostMemoryRegister. + Stream &ThenMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size); + + // Alternative interface for memcpying from device to host that takes an + // array slice. Checks that the destination size can accommodate the host + // slice size.
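+ // For instance (a sketch; assumes gpu_src holds 1024 floats and that + // port::MutableArraySlice has the usual (pointer, length) constructor): + // + // std::vector<float> host(1024); + // stream.ThenMemcpyD2H<float>( + // gpu_src, port::MutableArraySlice<float>(host.data(), host.size()));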
+ template <typename T> + Stream &ThenMemcpyD2H(const DeviceMemory<T> &gpu_src, + port::MutableArraySlice<T> host_dst) { + auto host_size = host_dst.size() * sizeof(T); + CHECK(gpu_src.size() == 0 || host_size >= gpu_src.size()); + return ThenMemcpy(host_dst.begin(), gpu_src, host_size); + } + + // Alternative interface for memcpying from host to device that takes an + // array slice. Checks that the destination size can accommodate the host + // slice size. + template <typename T> + Stream &ThenMemcpyH2D(port::ArraySlice<T> host_src, + DeviceMemory<T> *gpu_dst) { + auto host_size = host_src.size() * sizeof(T); + CHECK(gpu_dst->size() == 0 || gpu_dst->size() >= host_size); + return ThenMemcpy(gpu_dst, host_src.begin(), host_size); + } + + // Entrain onto the stream: a memcpy to a GPU destination from a GPU source + // of the given target size. gpu_src/dst must be pointers to GPU memory and + // peer access must be enabled between their owning StreamExecutors. + Stream &ThenMemcpy(DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, + uint64 size); + + // Forwards to the device-to-device copy overload of ThenMemcpy -- useful for + // ensuring that the host pointer isn't getting confused accidentally with a + // device pointer if you're not doing metaprogramming against the API. + Stream &ThenMemcpyD2D(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + return ThenMemcpy(gpu_dst, gpu_src, size); + } + + // Entrain onto the stream: a memset of zero at a GPU location of size + // bytes. + // The location must not be null. + // TODO(leary) Presently the size must be a 4-byte multiple. + Stream &ThenMemZero(DeviceMemoryBase *location, uint64 size); + + // Entrain onto the stream: a memset of a 32-bit pattern at a GPU location + // of + // size bytes, where size must be evenly 32-bit sized (i.e. evenly + // divisible + // by 4). The location must not be null. + Stream &ThenMemset32(DeviceMemoryBase *location, const uint32 &pattern, + uint64 size); + + // (Synchronously) block the host code waiting for the operations entrained + // on + // the stream (enqueued to this point in program execution) to complete. + bool BlockHostUntilDone(); + + // Warning! This method interacts with internal threads in + // sometimes-unpredictable ways and is intended for GPU-Executor-internal + // use + // only. Please check with a member of the FASTR team before making use of + // this method. + // + // Entrains onto the stream a function to be executed on the host at some + // point in the future. + // Async host callbacks DO NOT block the stream as device functions (or as + // synchronous host callbacks). No synchronization is possible with + // asynchronous callbacks; they are strictly fire-and-forget. + // This method is private due to the potential for undefined behavior with + // synchronization using OpenCL user events. + // The ONLY lifetime guarantee in these calls is that the StreamExecutor + // parameter will still be valid - this Stream may not be! + // Any callbacks requiring device API calls must use this method. + Stream &ThenEnqueueOnBackgroundThread( + std::function<void(StreamExecutor *)> task); + + // Returns the (opaque) platform-specific backing object. Ownership is not + // transferred to the caller. + internal::StreamInterface *implementation() { return implementation_.get(); } + + // Entrains onto the stream a callback to the host (from the device).
+ // Host callbacks block/occupy the stream just as device functions + // (execute one at a time, block later stream operations). + // Behavior is undefined when synchronizing using OpenCL user events. + // Behavior is undefined if host callbacks call device routines or insert + // them into any stream. + // On certain platforms, ThenDoHostCallback is expected to have significant + // negative effects on performance. + Stream &ThenDoHostCallback(std::function<void()> callback); + + // Identical to ThenDoHostCallback; only exposed for testing purposes. + Stream &ThenDoHostCallbackForTest(std::function<void()> callback); + + // Returns the StreamExecutor (parent object) associated with this stream. + StreamExecutor *parent() const { + CHECK(parent_ != nullptr); + return parent_; + } + + // Returns the (internal usage) temporary-memory-allocation manager associated + // with this stream. + internal::TemporaryMemoryManager *temporary_memory_manager(); + + private: + friend class host::HostBlas; // for parent_. + friend class host::HostFft; // for parent_. + friend class host::HostRng; // for parent_. + template <typename... Args> + friend struct ThenBlasImpl; // for implementing ThenBlasXXX. + friend class ocl::CLBlas; // for parent_. + + bool InErrorState() const { + shared_lock lock{mu_}; + return !ok_; + } + + // Sets the error state if operation_retcode is false. + // This is a useful shorthand for many stream routines. + void CheckError(bool operation_retcode) { + if (operation_retcode) { + return; + } + mutex_lock lock{mu_}; + ok_ = false; + } + + void SetError() { CheckError(false /* = operation_retcode */); } + + // The platform-dependent implementation that the StreamExecutor interface + // delegates to. + std::unique_ptr<internal::StreamInterface> implementation_; + + // The StreamExecutor that supports the operation of this stream. + StreamExecutor *parent_; + + // mutex that guards the allocation / error state flags. + // Mutable so that it can be obtained via const reader lock. + mutable mutex mu_; + + // Whether Init() was successfully called to allocate this stream on the + // underlying platform. It simply flips from 0 to 1 with a sanity check. + // See StreamExecutor::AllocateStream. + bool allocated_ GUARDED_BY(mu_); + + // Whether all operations have entrained successfully to the current program + // point. + bool ok_ GUARDED_BY(mu_); + + // Sub-streams that are generated from this stream. Each element has a pointer + // to sub-stream and a boolean value indicating if this substream is ready to + // be reused. + std::vector<std::pair<std::unique_ptr<Stream>, bool>> sub_streams_ + GUARDED_BY(mu_); + + // Streams can allocate temporary memories to help with work they enqueue + // (e.g. for scratch memory spaces). This member tracks those allocations and + // notes when they can be reclaimed -- reclamation is attempted when + // BlockHostUntilDone() is called. 
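+ // For example (a sketch; ValueOrDie() is the usual StatusOr accessor and is + // assumed here): + // + // auto temporary = stream.AllocateTemporaryArray<float>(1024); + // // ... use *temporary.ValueOrDie() as scratch space; it is reclaimed + // // after a subsequent BlockHostUntilDone().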
+ internal::TemporaryMemoryManager temporary_memory_manager_; + + SE_DISALLOW_COPY_AND_ASSIGN(Stream); +}; + +//////////// +// Inlines + +template <typename T> +inline port::StatusOr<std::unique_ptr<TemporaryDeviceMemory<T>>> +Stream::AllocateTemporaryArray(uint64 element_count) { + return temporary_memory_manager_.AllocateArray<T>(element_count); +} + +inline internal::TemporaryMemoryManager *Stream::temporary_memory_manager() { + return &temporary_memory_manager_; +} + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_STREAM_H_ diff --git a/tensorflow/stream_executor/stream_executor.h b/tensorflow/stream_executor/stream_executor.h new file mode 100644 index 0000000000..3bccaec5e3 --- /dev/null +++ b/tensorflow/stream_executor/stream_executor.h @@ -0,0 +1,50 @@ +// The StreamExecutor is a single-device abstraction for: +// +// * Loading/launching data-parallel kernels +// * Invoking pre-canned high-performance library routines (like matrix +// multiply) +// +// The appropriately-typed kernel and "loader spec" are automatically generated +// for the user within a namespace by the gcudacc compiler output, so typical +// use looks like so: +// +// namespace gpu = ::perftools::gputools; +// namespace gcudacc = ::platforms::gpus::gcudacc; +// +// gpu::StreamExecutor stream_exec{PlatformKind::kCuda}; +// gcudacc::kernel::MyKernel my_kernel{&stream_exec}; +// bool ok = stream_exec.GetKernel(gcudacc::spec::MyKernelSpec(), +// &my_kernel); +// if (!ok) { ... } +// gpu::DeviceMemory<int> result = stream_exec.AllocateZeroed<int>(); +// if (result == nullptr) { ... } +// int host_result; +// gpu::Stream my_stream{&stream_exec}; +// my_stream +// .Init() +// .ThenLaunch(ThreadDim{1024}, BlockDim{1}, my_kernel, result) +// .ThenMemcpy(&host_result, result, sizeof(host_result)) +// .BlockHostUntilDone(); +// if (!my_stream.ok()) { ... } +// printf("%d\n", host_result); +// +// Since the device may operate asynchronously to the host, the +// Stream::BlockHostUntilDone() call forces the calling host thread to wait for +// the chain of commands specified for the Stream to complete execution.
+ +#ifndef TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ +#define TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ + +#include "tensorflow/stream_executor/device_description.h" // IWYU pragma: export +#include "tensorflow/stream_executor/device_memory.h" // IWYU pragma: export +#include "tensorflow/stream_executor/device_options.h" // IWYU pragma: export +#include "tensorflow/stream_executor/event.h" // IWYU pragma: export +#include "tensorflow/stream_executor/kernel.h" // IWYU pragma: export +#include "tensorflow/stream_executor/kernel_spec.h" // IWYU pragma: export +#include "tensorflow/stream_executor/launch_dim.h" // IWYU pragma: export +#include "tensorflow/stream_executor/platform.h" // IWYU pragma: export +#include "tensorflow/stream_executor/stream.h" // IWYU pragma: export +#include "tensorflow/stream_executor/stream_executor_pimpl.h" // IWYU pragma: export +#include "tensorflow/stream_executor/timer.h" // IWYU pragma: export + +#endif // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc new file mode 100644 index 0000000000..b2785e0874 --- /dev/null +++ b/tensorflow/stream_executor/stream_executor_internal.cc @@ -0,0 +1,65 @@ +#include "tensorflow/stream_executor/stream_executor_internal.h" + +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" + +namespace perftools { +namespace gputools { +namespace internal { + +// -- CUDA + +StreamExecutorFactory* MakeCUDAExecutorImplementation() { + static StreamExecutorFactory instance; + return &instance; +} +EventFactory* MakeCUDAEventImplementation() { + static EventFactory instance; + return &instance; +} +StreamFactory* MakeCUDAStreamImplementation() { + static StreamFactory instance; + return &instance; +} +TimerFactory* MakeCUDATimerImplementation() { + static TimerFactory instance; + return &instance; +} +KernelFactory* MakeCUDAKernelImplementation() { + static KernelFactory instance; + return &instance; +} + +// -- OpenCL + +StreamExecutorFactory* MakeOpenCLExecutorImplementation() { + static StreamExecutorFactory instance; + return &instance; +} +StreamExecutorFactory* MakeOpenCLAlteraExecutorImplementation() { + static StreamExecutorFactory instance; + return &instance; +} +StreamFactory* MakeOpenCLStreamImplementation() { + static StreamFactory instance; + return &instance; +} +TimerFactory* MakeOpenCLTimerImplementation() { + static TimerFactory instance; + return &instance; +} +KernelFactory* MakeOpenCLKernelImplementation() { + static KernelFactory instance; + return &instance; +} + +// -- Host + +StreamExecutorFactory MakeHostExecutorImplementation; +StreamFactory MakeHostStreamImplementation; +TimerFactory MakeHostTimerImplementation; + + +} // namespace internal +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h new file mode 100644 index 0000000000..5b4e596cfe --- /dev/null +++ b/tensorflow/stream_executor/stream_executor_internal.h @@ -0,0 +1,364 @@ +// Interfaces for platform-dependent implementations to satisfy. These are +// delegated to from the StreamExecutor in pointer-to-implementation style; i.e. +// the StreamExecutor is just a husk that delegates calls to the +// platform-specific objects which implement the interfaces defined here.
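+// +// As a sketch of the delegation (implementation_ here names the wrapped +// platform object held by the StreamExecutor; the member name is assumed for +// illustration), a call such as +// +// void *ptr = stream_exec.Allocate(size); +// +// simply forwards to implementation_->Allocate(size) on the +// StreamExecutorInterface defined below.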
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
+#define TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/stream_executor/device_description.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/device_options.h"
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/kernel.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/kernel_spec.h"
+#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/shared_memory_config.h"
+#include "tensorflow/stream_executor/trace_listener.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
+
+namespace perftools {
+namespace gputools {
+
+class KernelBase;
+class Stream;
+class Timer;
+
+namespace blas {
+class BlasSupport;
+}  // namespace blas
+
+namespace fft {
+class Support;
+}  // namespace fft
+
+namespace rng {
+class RngSupport;
+}  // namespace rng
+
+}  // namespace gputools
+}  // namespace perftools
+
+namespace perftools {
+namespace gputools {
+namespace internal {
+
+// Interface for the different StreamExecutor platforms (i.e. CUDA, OpenCL).
+//
+// Various platforms provide implementations that satisfy this interface.
+class StreamExecutorInterface {
+ public:
+  // Default constructor for the abstract interface.
+  StreamExecutorInterface() {}
+
+  // Default destructor for the abstract interface.
+  virtual ~StreamExecutorInterface() {}
+
+  // Returns the (transitively) wrapped executor if this executor is
+  // wrapping another executor; otherwise, returns this.
+  virtual StreamExecutorInterface *GetUnderlyingExecutor() { return this; }
+
+  // See the StreamExecutor interface for comments on the same-named methods.
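+  //
+  // As an illustrative sketch only (the class name FooExecutor is
+  // hypothetical), a platform supplies a subclass that overrides them:
+  //
+  //    class FooExecutor : public StreamExecutorInterface {
+  //      port::Status Init(int device_ordinal,
+  //                        DeviceOptions device_options) override;
+  //      void *Allocate(uint64 size) override;
+  //      // ... and so on for the remaining methods.
+  //    };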
+  virtual port::Status Init(int device_ordinal,
+                            DeviceOptions device_options) = 0;
+  virtual bool GetKernel(const MultiKernelLoaderSpec &spec,
+                         KernelBase *kernel) {
+    return false;
+  }
+  virtual bool Launch(Stream *stream, const ThreadDim &thread_dims,
+                      const BlockDim &block_dims, const KernelBase &k,
+                      const std::vector<KernelArg> &args) {
+    return false;
+  }
+  virtual void *Allocate(uint64 size) = 0;
+  virtual void *AllocateSubBuffer(DeviceMemoryBase *parent, uint64 offset,
+                                  uint64 size) = 0;
+  virtual void Deallocate(DeviceMemoryBase *mem) = 0;
+  virtual void *HostMemoryAllocate(uint64 size) = 0;
+  virtual void HostMemoryDeallocate(void *mem) = 0;
+  virtual bool HostMemoryRegister(void *mem, uint64 size) = 0;
+  virtual bool HostMemoryUnregister(void *mem) = 0;
+  virtual bool SynchronizeAllActivity() = 0;
+  virtual bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) = 0;
+  virtual bool SynchronousMemSet(DeviceMemoryBase *location, int value,
+                                 uint64 size) = 0;
+  virtual bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
+                                 const void *host_src, uint64 size) = 0;
+  virtual bool SynchronousMemcpy(void *host_dst,
+                                 const DeviceMemoryBase &gpu_src,
+                                 uint64 size) = 0;
+  virtual bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
+                                               const DeviceMemoryBase &gpu_src,
+                                               uint64 size) = 0;
+  virtual bool MemZero(Stream *stream, DeviceMemoryBase *location,
+                       uint64 size) = 0;
+  virtual bool Memset32(Stream *stream, DeviceMemoryBase *location,
+                        uint32 pattern, uint64 size) = 0;
+  virtual bool Memcpy(Stream *stream, void *host_dst,
+                      const DeviceMemoryBase &gpu_src, uint64 size) = 0;
+  virtual bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
+                      const void *host_src, uint64 size) = 0;
+  virtual bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
+                                    const DeviceMemoryBase &gpu_src,
+                                    uint64 size) = 0;
+  virtual bool HostCallback(Stream *stream, std::function<void()> callback) = 0;
+  virtual port::Status AllocateEvent(Event *event) = 0;
+  virtual port::Status DeallocateEvent(Event *event) = 0;
+  virtual port::Status RecordEvent(Stream *stream, Event *event) = 0;
+  virtual port::Status WaitForEvent(Stream *stream, Event *event) = 0;
+  virtual Event::Status PollForEventStatus(Event *event) = 0;
+  virtual bool AllocateStream(Stream *stream) = 0;
+  virtual void DeallocateStream(Stream *stream) = 0;
+  virtual bool CreateStreamDependency(Stream *dependent, Stream *other) = 0;
+  virtual bool AllocateTimer(Timer *timer) = 0;
+  virtual void DeallocateTimer(Timer *timer) = 0;
+  virtual bool StartTimer(Stream *stream, Timer *timer) = 0;
+  virtual bool StopTimer(Stream *stream, Timer *timer) = 0;
+  virtual bool BlockHostUntilDone(Stream *stream) = 0;
+  virtual int PlatformDeviceCount() = 0;
+  virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
+  virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
+  virtual SharedMemoryConfig GetDeviceSharedMemoryConfig() = 0;
+  virtual port::Status SetDeviceSharedMemoryConfig(
+      SharedMemoryConfig config) = 0;
+
+  virtual bool DeviceMemoryUsage(int64 *free, int64 *total) const {
+    return false;
+  }
+
+  // Retrieves the device pointer and size for a symbol. The device pointer is
+  // stored at mem, and the size is stored at bytes. Either mem or bytes may be
+  // null, but not both at the same time. To use constant memory in CUDA,
+  // GetSymbol has to be used. Returns true if the symbol is found.
+  virtual bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) {
+    return false;
+  }
+
+  // Creates a new DeviceDescription object. Ownership is transferred to the
+  // caller.
+  virtual DeviceDescription *PopulateDeviceDescription() const = 0;
+
+  virtual KernelArg DeviceMemoryToKernelArg(
+      const DeviceMemoryBase &gpu_mem) const = 0;
+
+  // Attempts to register the provided TraceListener with the device-specific
+  // Executor implementation. When this is called, the PIMPL interface has
+  // already taken ownership of the object and is managing the generic tracing
+  // events. The device-specific implementation must determine if the passed
+  // listener is of a type appropriate for it to trace during registration (and
+  // before dispatching events to it).
+  // Returns true if the listener was successfully registered, false otherwise.
+  // Does not take ownership of listener.
+  virtual bool RegisterTraceListener(TraceListener* listener) { return false; }
+
+  // Unregisters the specified listener from the device-specific Executor.
+  // Returns true if the listener was successfully unregistered, false
+  // otherwise.
+  virtual bool UnregisterTraceListener(TraceListener* listener) {
+    return false;
+  }
+
+  // Returns whether this StreamExecutor has BLAS support for its underlying
+  // platform.
+  virtual bool SupportsBlas() const { return false; }
+
+  // Creates a new BlasSupport object, ownership is transferred to the caller.
+  // If SupportsBlas() is false, this will always return null.
+  //
+  // If SupportsBlas() is true, this may return null, for example, if the BLAS
+  // initialization fails.
+  virtual blas::BlasSupport *CreateBlas() { return nullptr; }
+
+  // Returns whether this StreamExecutor has FFT support for its underlying
+  // platform.
+  virtual bool SupportsFft() const { return false; }
+
+  // Creates a new fft::FftSupport object, ownership is transferred to the
+  // caller.
+  // If SupportsFft() is false, this will always return null.
+  //
+  // If SupportsFft() is true, this may return null, for example, if the FFT
+  // initialization fails.
+  virtual fft::FftSupport *CreateFft() { return nullptr; }
+
+  // Returns whether this StreamExecutor has random number generation support
+  // for its underlying platform.
+  virtual bool SupportsRng() const { return false; }
+
+  // Returns whether this StreamExecutor has neural net support for its
+  // underlying platform.
+  virtual bool SupportsDnn() const { return false; }
+
+  // Creates a new RngSupport object, ownership is transferred to the caller.
+  // If SupportsRng() is false, this will always return null.
+  //
+  // If SupportsRng() is true, this may return null, for example, if the RNG
+  // initialization fails.
+  virtual rng::RngSupport *CreateRng() { return nullptr; }
+
+  // Creates a new DnnSupport object, ownership is transferred to the caller.
+  // If SupportsDnn() is false, this will always return null.
+  //
+  // If SupportsDnn() is true, this may return null, for example, if the DNN
+  // initialization fails.
+  virtual dnn::DnnSupport *CreateDnn() { return nullptr; }
+
+  // Please read the warning below. This method is only temporary. See
+  // http://b/15759750
+  //
+  // Returns the CUDA context associated with this StreamExecutor platform
+  // implementation.
+  //
+  // WARNING: checks that the underlying platform is, in fact, CUDA, causing a
+  // fatal error if it is not. This hack is made available solely for use from
+  // distbelief code, which temporarily has strong ties to CUDA as a platform.
+ virtual void *CudaContextHack() { return nullptr; } + + private: + SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutorInterface); +}; + +// Pointer-to-implementation object type (i.e. the KernelBase class delegates to +// this interface) with virtual destruction. This class exists for the +// platform-dependent code to hang any kernel data/resource info/functionality +// off of. +class KernelInterface { + public: + // Default constructor for the abstract interface. + KernelInterface() {} + + // Default destructor for the abstract interface. + virtual ~KernelInterface() {} + + // Returns the number of formal parameters that this kernel accepts. + virtual unsigned Arity() const = 0; + + // Sets the preferred cache configuration. + virtual void SetPreferredCacheConfig(KernelCacheConfig config) = 0; + + // Gets the preferred cache configuration. + virtual KernelCacheConfig GetPreferredCacheConfig() const = 0; + + private: + SE_DISALLOW_COPY_AND_ASSIGN(KernelInterface); +}; + +// Platform-dependent interface class for the generic Events interface, in +// the PIMPL style. +class EventInterface { + public: + EventInterface() {} + virtual ~EventInterface() {} + + private: + SE_DISALLOW_COPY_AND_ASSIGN(EventInterface); +}; + +// Pointer-to-implementation object type (i.e. the Stream class delegates to +// this interface) with virtual destruction. This class exists for the +// platform-dependent code to hang any kernel data/resource info/functionality +// off of. +class StreamInterface { + public: + // Default constructor for the abstract interface. + StreamInterface() {} + + // Default destructor for the abstract interface. + virtual ~StreamInterface() {} + + // Please read the warning below. This method is only temporary. See + // http://b/15759750 + // + // Returns the CUDA stream associated with this platform's stream + // implementation. + // + // WARNING: checks that the underlying platform is, in fact, CUDA, causing a + // fatal error if it is not. This hack is made available solely for use from + // distbelief code, which temporarily has strong ties to CUDA as a platform. + virtual void *CudaStreamHack() { return nullptr; } + + // Please read the warning above. This method is only temporary. See + // http://b/15759750 + // + // See the above comment on CudaStreamHack -- this further breaks abstraction + // for Eigen within distbelief, which has strong ties to CUDA as a platform, + // and a historical attachment to a programming model which takes a + // stream-slot rather than a stream-value. + virtual void **CudaStreamMemberHack() { return nullptr; } + + private: + SE_DISALLOW_COPY_AND_ASSIGN(StreamInterface); +}; + +// Pointer-to-implementation object type (i.e. the Timer class delegates to +// this interface) with virtual destruction. This class exists for the +// platform-dependent code to hang any timer data/resource info/functionality +// off of. +class TimerInterface { + public: + // Default constructor for the abstract interface. + TimerInterface() {} + + // Default destructor for the abstract interface. + virtual ~TimerInterface() {} + + // Returns the number of microseconds elapsed in a completed timer. + virtual uint64 Microseconds() const = 0; + + // Returns the number of nanoseconds elapsed in a completed timer. + virtual uint64 Nanoseconds() const = 0; + + private: + SE_DISALLOW_COPY_AND_ASSIGN(TimerInterface); +}; + +// Extern functions for constructing platform-specific instances that conform to +// the StreamExecutor interface. 
(Defining constructor functions extern in this +// way prevents CUDA/OpenCL headers from leaking into any shared header files.) +// +// TODO(leary) switch this all over to registries. + +using StreamExecutorFactory = + std::function<StreamExecutorInterface *(const PluginConfig &)>; +using EventFactory = std::function<EventInterface *(StreamExecutor *)>; +using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>; +using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>; +using KernelFactory = std::function<KernelInterface*()>; + +EventFactory* MakeCUDAEventImplementation(); +StreamExecutorFactory* MakeCUDAExecutorImplementation(); +StreamFactory* MakeCUDAStreamImplementation(); +TimerFactory* MakeCUDATimerImplementation(); +KernelFactory* MakeCUDAKernelImplementation(); + +StreamExecutorFactory* MakeOpenCLExecutorImplementation(); +StreamExecutorFactory* MakeOpenCLAlteraExecutorImplementation(); +StreamFactory* MakeOpenCLStreamImplementation(); +TimerFactory* MakeOpenCLTimerImplementation(); +KernelFactory* MakeOpenCLKernelImplementation(); + +extern StreamExecutorFactory MakeHostExecutorImplementation; +extern StreamFactory MakeHostStreamImplementation; +extern TimerFactory MakeHostTimerImplementation; + + +} // namespace internal +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_ diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc new file mode 100644 index 0000000000..22b7a50b79 --- /dev/null +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -0,0 +1,642 @@ +// Implements the StreamExecutor interface by passing through to its +// implementation_ value (in pointer-to-implementation style), which +// implements StreamExecutorInterface. + +#include "tensorflow/stream_executor/stream_executor_pimpl.h" + +#include <atomic> + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/fft.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/notification.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/rng.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +namespace { +bool FLAGS_check_gpu_leaks = false; +} // namespace + +namespace perftools { +namespace gputools { +namespace { + +// Maximum stack depth to report when generating backtrace on mem allocation +// (for GPU memory leak checker) +static const int kMaxStackDepth = 256; + +// Make sure the executor is done with its work; we know (because this isn't +// publicly visible) that all enqueued work is quick. +void BlockOnThreadExecutor(port::ThreadPool *executor) { + port::Notification n; + executor->Schedule([&n]() { n.Notify(); }); + n.WaitForNotification(); +} + +internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind( + PlatformKind platform_kind, const PluginConfig &plugin_config) { + // Note: we use this factory-assignment-in-switch pattern instead of just + // invoking the callable in case linkage is messed up -- instead of invoking a + // nullptr std::function (due to failed registration) we give a nice + // LOG(FATAL) message. 
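+  // For reference, a platform typically registers its factory at
+  // initialization time by assigning through the accessor above (an
+  // illustrative sketch; CUDAExecutor stands in for the platform's
+  // implementation class):
+  //
+  //    *internal::MakeCUDAExecutorImplementation() =
+  //        [](const PluginConfig &config) { return new CUDAExecutor{config}; };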
+ internal::StreamExecutorFactory factory; + switch (platform_kind) { + case PlatformKind::kCuda: + factory = *internal::MakeCUDAExecutorImplementation(); + break; + case PlatformKind::kOpenCL: + factory = *internal::MakeOpenCLExecutorImplementation(); + break; + case PlatformKind::kOpenCLAltera: + factory = *internal::MakeOpenCLAlteraExecutorImplementation(); + break; + case PlatformKind::kHost: + factory = internal::MakeHostExecutorImplementation; + break; + default: + factory = nullptr; + } + if (factory == nullptr) { + LOG(FATAL) + << "cannot create GPU executor implementation for platform kind: " + << PlatformKindString(platform_kind); + } + return factory(plugin_config); +} + +std::atomic_int_fast64_t correlation_id_generator(0); + +} // namespace + +template <typename BeginCallT, typename CompleteCallT, + typename ReturnT, typename... BeginArgsT> +class ScopedTracer { + public: + ScopedTracer(StreamExecutor *stream_exec, BeginCallT begin_call, + CompleteCallT complete_call, const ReturnT *result, + BeginArgsT... begin_args) + : stream_exec_(stream_exec), + complete_call_(complete_call), + result_(result) { + if (stream_exec_->tracing_enabled_) { + correlation_id_ = + correlation_id_generator.fetch_add(1, std::memory_order_relaxed) - 1; + Trace(begin_call, begin_args...); + } + } + + ~ScopedTracer() { + if (stream_exec_->tracing_enabled_) { + Trace(complete_call_, result_); + } + } + + private: + template <typename CallbackT, typename... TraceArgsT> + void Trace(CallbackT callback, TraceArgsT... args) { + { + // Instance tracers held in a block to limit the lock lifetime. + shared_lock lock{stream_exec_->mu_}; + for (TraceListener *listener : stream_exec_->listeners_) { + (listener->*callback)(correlation_id_, + std::forward<TraceArgsT>(args)...); + } + } + } + + StreamExecutor *stream_exec_; + CompleteCallT complete_call_; + const ReturnT* result_; + int64 correlation_id_; +}; + +template <typename BeginCallT, typename CompleteCallT, typename ReturnT, + typename... BeginArgsT> +ScopedTracer<BeginCallT, CompleteCallT, ReturnT, BeginArgsT...> +MakeScopedTracer(StreamExecutor *stream_exec, BeginCallT begin_call, + CompleteCallT complete_call, ReturnT *result, + BeginArgsT... begin_args) { + return ScopedTracer<BeginCallT, CompleteCallT, ReturnT, BeginArgsT...>( + stream_exec, begin_call, complete_call, result, + std::forward<BeginArgsT>(begin_args)...); +} + +#define SCOPED_TRACE(LOC, ...) 
\ + auto tracer = MakeScopedTracer(this, &LOC ## Begin, \ + &LOC ## Complete, ## __VA_ARGS__); + +/* static */ mutex StreamExecutor::static_mu_{LINKER_INITIALIZED}; + +StreamExecutor::StreamExecutor(PlatformKind platform_kind, + const PluginConfig &plugin_config) + : implementation_(StreamExecutorImplementationFromPlatformKind( + platform_kind, plugin_config)), + platform_kind_(platform_kind), + device_ordinal_(-1), + background_threads_(new port::ThreadPool( + port::Env::Default(), "stream_executor", kNumBackgroundThreads)), + live_stream_count_(0), + tracing_enabled_(false) { + CheckPlatformKindIsValid(platform_kind); +} + +StreamExecutor::StreamExecutor( + PlatformKind platform_kind, + internal::StreamExecutorInterface *implementation) + : implementation_(implementation), + platform_kind_(platform_kind), + device_ordinal_(-1), + background_threads_(new port::ThreadPool( + port::Env::Default(), "stream_executor", kNumBackgroundThreads)), + live_stream_count_(0), + tracing_enabled_(false) { + CheckPlatformKindIsValid(platform_kind); +} + +StreamExecutor::~StreamExecutor() { + BlockOnThreadExecutor(background_threads_.get()); + + if (live_stream_count_.load() != 0) { + LOG(WARNING) << "Not all streams were deallocated at executor destruction " + << "time. This may lead to unexpected/bad behavior - " + << "especially if any stream is still active!"; + } + + if (FLAGS_check_gpu_leaks) { + for (auto it : mem_allocs_) { + LOG(INFO) << "Memory alloced at executor exit: addr: " + << port::Printf("%p", it.first) + << ", bytes: " << it.second.bytes << ", trace: \n" + << it.second.stack_trace; + } + } +} + +port::Status StreamExecutor::Init(int device_ordinal, + DeviceOptions device_options) { + device_ordinal_ = device_ordinal; + return implementation_->Init(device_ordinal, device_options); +} + +port::Status StreamExecutor::Init() { + return Init(0, DeviceOptions::Default()); +} + +bool StreamExecutor::GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) { + return implementation_->GetKernel(spec, kernel); +} + +void StreamExecutor::Deallocate(DeviceMemoryBase *mem) { + VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque() + << ") mem->size()=" << mem->size(); + + if (mem->opaque() != nullptr) { + EraseAllocRecord(mem->opaque()); + } + implementation_->Deallocate(mem); + mem->Reset(nullptr, 0); +} + +void StreamExecutor::GetMemAllocs(std::map<void *, AllocRecord> *records_out) { + shared_lock lock{mu_}; + *records_out = mem_allocs_; +} + +bool StreamExecutor::CanEnablePeerAccessTo(StreamExecutor *other) { + return implementation_->CanEnablePeerAccessTo(other->implementation_.get()); +} + +port::Status StreamExecutor::EnablePeerAccessTo(StreamExecutor *other) { + return implementation_->EnablePeerAccessTo(other->implementation_.get()); +} + +SharedMemoryConfig StreamExecutor::GetDeviceSharedMemoryConfig() { + return implementation_->GetDeviceSharedMemoryConfig(); +} + +port::Status StreamExecutor::SetDeviceSharedMemoryConfig( + SharedMemoryConfig config) { + if (config != SharedMemoryConfig::kDefault && + config != SharedMemoryConfig::kFourByte && + config != SharedMemoryConfig::kEightByte) { + string error_msg = port::Printf( + "Invalid shared memory config specified: %d", static_cast<int>(config)); + LOG(ERROR) << error_msg; + return port::Status{port::error::INVALID_ARGUMENT, error_msg}; + } + return implementation_->SetDeviceSharedMemoryConfig(config); +} + +const DeviceDescription &StreamExecutor::GetDeviceDescription() const { + mutex_lock lock{mu_}; + if 
(device_description_ != nullptr) { + return *device_description_; + } + + device_description_.reset(PopulateDeviceDescription()); + return *device_description_; +} + +int StreamExecutor::PlatformDeviceCount() const { + return implementation_->PlatformDeviceCount(); +} + +bool StreamExecutor::SupportsBlas() const { + return implementation_->SupportsBlas(); +} + +bool StreamExecutor::SupportsRng() const { + return implementation_->SupportsRng(); +} + +bool StreamExecutor::SupportsDnn() const { + return implementation_->SupportsDnn(); +} + +dnn::DnnSupport *StreamExecutor::AsDnn() { + mutex_lock lock{mu_}; + if (dnn_ != nullptr) { + return dnn_.get(); + } + + dnn_.reset(implementation_->CreateDnn()); + return dnn_.get(); +} + +blas::BlasSupport *StreamExecutor::AsBlas() { + mutex_lock lock{mu_}; + if (blas_ != nullptr) { + return blas_.get(); + } + + blas_.reset(implementation_->CreateBlas()); + return blas_.get(); +} + +fft::FftSupport *StreamExecutor::AsFft() { + mutex_lock lock{mu_}; + if (fft_ != nullptr) { + return fft_.get(); + } + + fft_.reset(implementation_->CreateFft()); + return fft_.get(); +} + +rng::RngSupport *StreamExecutor::AsRng() { + mutex_lock lock{mu_}; + if (rng_ != nullptr) { + return rng_.get(); + } + + rng_.reset(implementation_->CreateRng()); + return rng_.get(); +} + +bool StreamExecutor::Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, + const KernelBase &kernel, + const std::vector<KernelArg> &args) { + SubmitTrace(&TraceListener::LaunchSubmit, stream, thread_dims, block_dims, + kernel, args); + + return implementation_->Launch(stream, thread_dims, block_dims, kernel, args); +} + +bool StreamExecutor::BlockHostUntilDone(Stream *stream) { + bool result; + SCOPED_TRACE(TraceListener::BlockHostUntilDone, &result, stream); + + result = implementation_->BlockHostUntilDone(stream); + return result; +} + +void *StreamExecutor::Allocate(uint64 size) { + void *buf = implementation_->Allocate(size); + VLOG(1) << "Called StreamExecutor::Allocate(size=" << size + << ") returns " << buf; + CreateAllocRecord(buf, size); + + return buf; +} + +bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem, + size_t *bytes) { + return implementation_->GetSymbol(symbol_name, mem, bytes); +} + +void *StreamExecutor::HostMemoryAllocate(uint64 size) { + void *buffer = implementation_->HostMemoryAllocate(size); + VLOG(1) << "Called StreamExecutor::HostMemoryAllocate(size=" << size + << ") returns " << buffer; + return buffer; +} + +void StreamExecutor::HostMemoryDeallocate(void *location) { + VLOG(1) << "Called StreamExecutor::HostMemoryDeallocate(location=" + << location << ")"; + + return implementation_->HostMemoryDeallocate(location); +} + +bool StreamExecutor::HostMemoryRegister(void *location, uint64 size) { + VLOG(1) << "Called StreamExecutor::HostMemoryRegister(location=" << location + << ", size=" << size << ")"; + if (location == nullptr || size == 0) { + LOG(WARNING) << "attempting to register null or zero-sized memory: " + << location << "; size " << size; + } + return implementation_->HostMemoryRegister(location, size); +} + +bool StreamExecutor::HostMemoryUnregister(void *location) { + VLOG(1) << "Called StreamExecutor::HostMemoryUnregister(location=" << location + << ")"; + return implementation_->HostMemoryUnregister(location); +} + +bool StreamExecutor::SynchronizeAllActivity() { + VLOG(1) << "Called StreamExecutor::SynchronizeAllActivity()"; + bool ok = implementation_->SynchronizeAllActivity(); + + // This should all be quick 
and infallible work, so we can perform the + // synchronization even in the case of failure. + BlockOnThreadExecutor(background_threads_.get()); + + return ok; +} + +bool StreamExecutor::SynchronousMemZero(DeviceMemoryBase *location, + uint64 size) { + VLOG(1) << "Called StreamExecutor::SynchronousMemZero(location=" + << location << ", size=" << size << ")"; + + return implementation_->SynchronousMemZero(location, size); +} + +bool StreamExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) { + VLOG(1) << "Called StreamExecutor::SynchronousMemSet(location=" + << location << ", value=" << value << ", size=" << size << ")"; + + return implementation_->SynchronousMemSet(location, value, size); +} + +bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(gpu_dst=" + << gpu_dst->opaque() << ", host_src=" << host_src << ", size=" << size + << ") H2D"; + + // Tracing overloaded methods is very difficult due to issues with type + // inference on template args. Since use of these overloaded methods is + // discouraged anyway, this isn't a huge deal. + return implementation_->SynchronousMemcpy(gpu_dst, host_src, size); +} + +bool StreamExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(host_dst=" + << host_dst << ", gpu_src=" << gpu_src.opaque() << ", size=" << size + << ") D2H"; + + return implementation_->SynchronousMemcpy(host_dst, gpu_src, size); +} + +bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + VLOG(1) << "Called StreamExecutor::SynchronousMemcpy(gpu_dst=" + << gpu_dst->opaque() << ", gpu_src=" << gpu_src.opaque() << ", size=" << size + << ") D2D"; + + return implementation_->SynchronousMemcpyDeviceToDevice(gpu_dst, gpu_src, + size); +} + +port::Status StreamExecutor::SynchronousMemcpyD2H( + const DeviceMemoryBase &gpu_src, int64 size, void *host_dst) { + VLOG(1) << "Called StreamExecutor::SynchronousMemcpyD2H(gpu_src=" + << gpu_src.opaque() << ", size=" << size << ", host_dst=" << host_dst << ")"; + + port::Status result{port::Status::OK()}; + SCOPED_TRACE(TraceListener::SynchronousMemcpyD2H, + &result, gpu_src, size, host_dst); + + if (!implementation_->SynchronousMemcpy(host_dst, gpu_src, size)) { + return port::Status{ + port::error::INTERNAL, + port::Printf( + "failed to synchronously memcpy device-to-host: GPU %p to host %p " + "size %lld", + gpu_src.opaque(), host_dst, size)}; + } + + return result; +} + +port::Status StreamExecutor::SynchronousMemcpyH2D(const void *host_src, + int64 size, + DeviceMemoryBase *gpu_dst) { + VLOG(1) << "Called StreamExecutor::SynchronousMemcpyH2D(host_src=" + << host_src << ", size=" << size << ", gpu_dst" << gpu_dst->opaque() << ")"; + + port::Status result{port::Status::OK()}; + SCOPED_TRACE(TraceListener::SynchronousMemcpyH2D, + &result, host_src, size, gpu_dst); + + if (!implementation_->SynchronousMemcpy(gpu_dst, host_src, size)) { + result = port::Status{ + port::error::INTERNAL, + port::Printf("failed to synchronously memcpy host-to-device: host " + "%p to GPU %p size %lld", + host_src, gpu_dst->opaque(), size)}; + } + + return result; +} + +bool StreamExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &gpu_src, uint64 size) { + return implementation_->Memcpy(stream, host_dst, gpu_src, size); +} + +bool 
StreamExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, + const void *host_src, uint64 size) { + return implementation_->Memcpy(stream, gpu_dst, host_src, size); +} + +bool StreamExecutor::MemcpyDeviceToDevice(Stream *stream, + DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, + uint64 size) { + return implementation_->MemcpyDeviceToDevice(stream, gpu_dst, gpu_src, size); +} + +bool StreamExecutor::MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) { + return implementation_->MemZero(stream, location, size); +} + +bool StreamExecutor::Memset32(Stream *stream, DeviceMemoryBase *location, + uint32 pattern, uint64 size) { + CHECK_EQ(0, size % 4) + << "need 32-bit multiple size to fill with 32-bit pattern"; + return implementation_->Memset32(stream, location, pattern, size); +} + +bool StreamExecutor::HostCallback(Stream *stream, + std::function<void()> callback) { + return implementation_->HostCallback(stream, callback); +} + +port::Status StreamExecutor::AllocateEvent(Event *event) { + return implementation_->AllocateEvent(event); +} + +port::Status StreamExecutor::DeallocateEvent(Event *event) { + return implementation_->DeallocateEvent(event); +} + +port::Status StreamExecutor::RecordEvent(Stream *stream, Event *event) { + return implementation_->RecordEvent(stream, event); +} + +port::Status StreamExecutor::WaitForEvent(Stream *stream, Event *event) { + return implementation_->WaitForEvent(stream, event); +} + +Event::Status StreamExecutor::PollForEventStatus(Event *event) { + return implementation_->PollForEventStatus(event); +} + +bool StreamExecutor::AllocateStream(Stream *stream) { + live_stream_count_.fetch_add(1, std::memory_order_relaxed); + if (!implementation_->AllocateStream(stream)) { + auto count = live_stream_count_.fetch_sub(1); + CHECK_GE(count, 0) << "live stream count should not dip below zero"; + LOG(INFO) << "failed to allocate stream; live stream count: " << count; + return false; + } + + return true; +} + +void StreamExecutor::DeallocateStream(Stream *stream) { + implementation_->DeallocateStream(stream); + CHECK_GE(live_stream_count_.fetch_sub(1), 0) + << "live stream count should not dip below zero"; +} + +bool StreamExecutor::CreateStreamDependency(Stream *dependent, Stream *other) { + return implementation_->CreateStreamDependency(dependent, other); +} + +bool StreamExecutor::AllocateTimer(Timer *timer) { + return implementation_->AllocateTimer(timer); +} + +void StreamExecutor::DeallocateTimer(Timer *timer) { + return implementation_->DeallocateTimer(timer); +} + +bool StreamExecutor::StartTimer(Stream *stream, Timer *timer) { + return implementation_->StartTimer(stream, timer); +} + +bool StreamExecutor::StopTimer(Stream *stream, Timer *timer) { + return implementation_->StopTimer(stream, timer); +} + +DeviceDescription *StreamExecutor::PopulateDeviceDescription() const { + return implementation_->PopulateDeviceDescription(); +} + +bool StreamExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const { + return implementation_->DeviceMemoryUsage(free, total); +} + +KernelArg StreamExecutor::DeviceMemoryToKernelArg( + const DeviceMemoryBase &gpu_mem) const { + return implementation_->DeviceMemoryToKernelArg(gpu_mem); +} + +void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) { + background_threads_->Schedule(task); +} + +void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) { + if (FLAGS_check_gpu_leaks && opaque != nullptr && bytes != 0) { + mutex_lock lock{mu_}; + mem_allocs_[opaque] = 
AllocRecord{ + bytes, ""}; + } +} + +void StreamExecutor::EraseAllocRecord(void *opaque) { + if (FLAGS_check_gpu_leaks && opaque != nullptr) { + mutex_lock lock{mu_}; + if (mem_allocs_.find(opaque) == mem_allocs_.end()) { + LOG(ERROR) << "Deallocating unknown pointer: " + << port::Printf("0x%p", opaque); + } else { + mem_allocs_.erase(opaque); + } + } +} + +void StreamExecutor::EnableTracing(bool enabled) { tracing_enabled_ = enabled; } + +void StreamExecutor::RegisterTraceListener(TraceListener *listener) { + { + mutex_lock lock{mu_}; + if (listeners_.find(listener) != listeners_.end()) { + LOG(INFO) << "Attempt to register already-registered listener, " + << listener; + } else { + listeners_.insert(listener); + } + } + + implementation_->RegisterTraceListener(listener); +} + +bool StreamExecutor::UnregisterTraceListener(TraceListener *listener) { + { + mutex_lock lock{mu_}; + if (listeners_.find(listener) == listeners_.end()) { + LOG(INFO) << "Attempt to unregister unknown listener, " << listener; + return false; + } + listeners_.erase(listener); + } + + implementation_->UnregisterTraceListener(listener); + return true; +} + +template <typename TraceCallT, typename... ArgsT> +void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) { + if (tracing_enabled_) { + { + // instance tracers held in a block to limit the lock lifetime. + shared_lock lock{mu_}; + for (TraceListener *listener : listeners_) { + (listener->*trace_call)(std::forward<ArgsT>(args)...); + } + } + } +} + +internal::StreamExecutorInterface *StreamExecutor::implementation() { + return implementation_->GetUnderlyingExecutor(); +} + +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h new file mode 100644 index 0000000000..29ab235d0e --- /dev/null +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -0,0 +1,725 @@ +#ifndef TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_PIMPL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_PIMPL_H_ + +#include <atomic> +#include <set> +#include <tuple> +#include <vector> + +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/lib/strcat.h" +#include "tensorflow/stream_executor/lib/threadpool.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/rng.h" +#include "tensorflow/stream_executor/shared_memory_config.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { + +// Structure used for device memory leak checking. +struct AllocRecord { + // The requested allocation size of the buffer. + uint64 bytes; + + // Holds a representation of the stack at the time the associated buffer was + // allocated. Produced in a form described in + // //util/symbolize/symbolized_stacktrace.h. + string stack_trace; +}; + +// Forward declaration of private friend class. +template <typename BeginCallT, typename CompleteCallT, + typename ReturnT, typename... 
BeginArgsT>
+class ScopedTracer;
+
+// A StreamExecutor manages a single device, in terms of executing work (kernel
+// launches) and memory management (allocation/deallocation, memory copies to
+// and from the device). It is conceptually the "handle" for a device -- Stream
+// objects, which are used to enqueue work to run on the coprocessor, have a
+// StreamExecutor instance as their "parent" object.
+//
+// StreamExecutor objects have an underlying platform that is specified up
+// front; e.g. it is either a CUDA or OpenCL executor.
+//
+// Thread-safe after initialization.
+// StreamExecutor interface should not be invoked from a signal handler.
+class StreamExecutor {
+ public:
+  explicit StreamExecutor(PlatformKind kind,
+                          const PluginConfig &plugin_config = PluginConfig());
+
+  // Primarily used for testing.
+  StreamExecutor(PlatformKind kind,
+                 internal::StreamExecutorInterface *implementation);
+
+  ~StreamExecutor();
+
+  port::Status Init();
+  port::Status Init(int device_ordinal, DeviceOptions device_options);
+
+  // Returns the platform that this StreamExecutor is acting upon.
+  PlatformKind platform_kind() const { return platform_kind_; }
+
+  // Retrieves (loads) a kernel for the platform this StreamExecutor is acting
+  // upon, if one exists.
+  //
+  // Parameters:
+  //   spec: The MultiKernelLoaderSpec is usually generated as a compile-time
+  //    constant into an appropriate namespace. For example, see
+  //    perftools::gputools::executor_sample::kKernelLoaderSpecs, from which a
+  //    MultiKernelLoaderSpec is selected.
+  //   kernel: Outparam that the kernel is loaded into. A given Kernel
+  //    instantiation should not be loaded into more than once.
+  //
+  // If an error occurs, or there is no kernel available for the StreamExecutor
+  // platform, false is returned.
+  bool GetKernel(const MultiKernelLoaderSpec &spec, KernelBase *kernel);
+
+  // Synchronously allocates an array on the GPU device of type T with
+  // element_count elements.
+  template <typename T>
+  DeviceMemory<T> AllocateArray(uint64 element_count);
+
+  // As AllocateArray(), but returns a ScopedDeviceMemory<T>.
+  template <typename T>
+  ScopedDeviceMemory<T> AllocateOwnedArray(uint64 element_count) {
+    return ScopedDeviceMemory<T>(this, AllocateArray<T>(element_count));
+  }
+
+  // Convenience wrapper that allocates space for a single element of type T
+  // in GPU memory.
+  template <typename T>
+  DeviceMemory<T> AllocateScalar() {
+    return AllocateArray<T>(1);
+  }
+
+  // As AllocateScalar(), but returns a ScopedDeviceMemory<T>.
+  template <typename T>
+  ScopedDeviceMemory<T> AllocateOwnedScalar() {
+    return AllocateOwnedArray<T>(1);
+  }
+
+  // Synchronously allocates a scalar of type T on the GPU device that is
+  // (POD) zero-byte initialized.
+  template <typename T>
+  DeviceMemory<T> AllocateZeroed();
+
+  // As AllocateZeroed(), but returns a ScopedDeviceMemory<T>.
+  template <typename T>
+  ScopedDeviceMemory<T> AllocateOwnedZeroed() {
+    return ScopedDeviceMemory<T>(this, AllocateZeroed<T>());
+  }
+
+  // Allocates a memory region inside another allocated memory region.
+  // Offset and size are specified in terms of T elements.
+  // Warning: Do not free a parent buffer before its sub-buffers; this may cause
+  // use-after-free issues (the specific behavior is not consistent across
+  // platforms).
+  //  - Note: OpenCL uses refcounting to manage buffer lifetimes, so use of a
+  //    sub-buffer after parent deallocation is expected to be safe. This will
+  //    render your code non-platform-portable, however.
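+  //
+  // For example (illustrative sketch; sizes are arbitrary):
+  //
+  //    DeviceMemory<float> parent = executor->AllocateArray<float>(1024);
+  //    // A 128-element view beginning at element 256 of parent:
+  //    DeviceMemory<float> sub = executor->AllocateSubBuffer(&parent, 256, 128);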
+  template <typename T>
+  DeviceMemory<T> AllocateSubBuffer(DeviceMemory<T> *parent,
+                                    uint64 element_offset,
+                                    uint64 element_count);
+
+  // As AllocateSubBuffer(), but returns a ScopedDeviceMemory<T>.
+  template <typename T>
+  ScopedDeviceMemory<T> AllocateOwnedSubBuffer(DeviceMemory<T> *parent,
+                                               uint64 element_offset,
+                                               uint64 element_count) {
+    return ScopedDeviceMemory<T>(
+        this, AllocateSubBuffer<T>(parent, element_offset, element_count));
+  }
+
+  // Finds a symbol and returns device memory allocated to the symbol. The
+  // symbol is searched in any kernels that were previously loaded through
+  // GetKernel() before the GetSymbol() call. The caller must ensure that the
+  // type of the symbol and T match.
+  // - Note: symbol_name should include its namespace as well. For example,
+  //   pass "nms0::symbol" if referring to nms0::symbol.
+  template <typename T>
+  port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name);
+
+  // Deallocates the DeviceMemory previously allocated via this interface.
+  // Deallocation of a nullptr-representative value is permitted.
+  //
+  // Resets the internal contents of mem to be null-representative, but this
+  // null-out effect should not be relied upon in client code.
+  void Deallocate(DeviceMemoryBase *mem);
+
+  // Retrieves a mapping of active opaque GPU memory pointers to a string
+  // representation of the [allocating thread's] stack at the time the pointer
+  // was allocated. Useful for tracking GPU memory leaks.
+  //
+  // Note: this will only be populated if the --check_gpu_leaks flag is
+  // activated.
+  void GetMemAllocs(std::map<void *, AllocRecord> *records_out);
+
+  // Allocates a region of host memory and registers it with the platform API.
+  // Memory allocated in this manner (or allocated and registered with
+  // HostMemoryRegister()) is required for use in asynchronous memcpy
+  // operations, such as Stream::ThenMemcpy.
+  void *HostMemoryAllocate(uint64 bytes);
+
+  // Deallocates a region of host memory allocated by HostMemoryAllocate().
+  void HostMemoryDeallocate(void *location);
+
+  // Registers a region of host memory with the platform API. Registered memory
+  // (or memory allocated with HostMemoryAllocate) is required for use with
+  // asynchronous memcpy operations, such as Stream::ThenMemcpy. This method
+  // is used to register memory allocated outside the StreamExecutor;
+  // HostMemoryAllocate implicitly registers its allocations and
+  // HostMemoryDeallocate implicitly deregisters on deallocation.
+  bool HostMemoryRegister(void *location, uint64 size) SE_MUST_USE_RESULT;
+
+  // Unregisters a region of host memory registered with HostMemoryRegister.
+  // This should be done before deallocating the region with delete[]/free/etc.
+  bool HostMemoryUnregister(void *location) SE_MUST_USE_RESULT;
+
+  // Synchronizes all activity occurring in the StreamExecutor's context (most
+  // likely a whole device).
+  bool SynchronizeAllActivity() SE_MUST_USE_RESULT;
+
+  // Blocks the caller while "size" bytes are zeroed out (in POD fashion) at the
+  // given location in GPU memory.
+  bool SynchronousMemZero(DeviceMemoryBase *location,
+                          uint64 size) SE_MUST_USE_RESULT;
+
+  // Blocks the caller while "size" bytes are initialized to "value" (in POD
+  // fashion) at the given location in GPU memory.
+  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
+                         uint64 size) SE_MUST_USE_RESULT;
+
+  // [deprecated] Blocks the caller while a data segment of the given size is
+  // copied from the host source to the GPU destination.
+  //
+  // Deprecation: prefer explicit H2D below, to avoid error-prone API usage.
+  bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src,
+                         uint64 size) SE_MUST_USE_RESULT;
+
+  // [deprecated] Blocks the caller while a data segment of the given size is
+  // copied from the GPU source to the host destination.
+  //
+  // Deprecation: prefer explicit D2H below, to avoid error-prone API usage.
+  bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src,
+                         uint64 size) SE_MUST_USE_RESULT;
+
+  // Same as SynchronousMemcpy(DeviceMemoryBase*, ...) above.
+  port::Status SynchronousMemcpyH2D(const void *host_src, int64 size,
+                                    DeviceMemoryBase *gpu_dst);
+
+  // Alternative interface for memcpying from host to device that takes an
+  // array slice. Checks that the destination size can accommodate the host
+  // slice size.
+  template <class T>
+  port::Status SynchronousMemcpyH2D(port::ArraySlice<T> host_src,
+                                    DeviceMemoryBase *gpu_dst) {
+    auto host_size = host_src.size() * sizeof(T);
+    CHECK(gpu_dst->size() == 0 || gpu_dst->size() >= host_size);
+    return SynchronousMemcpyH2D(host_src.begin(), host_size, gpu_dst);
+  }
+
+  // Same as SynchronousMemcpy(void*, ...) above.
+  port::Status SynchronousMemcpyD2H(const DeviceMemoryBase &gpu_src, int64 size,
+                                    void *host_dst);
+
+  // Alternative interface for memcpying from device to host that takes an
+  // array slice. Checks that the destination size can accommodate the host
+  // slice size.
+  template <typename T>
+  port::Status SynchronousMemcpyD2H(const DeviceMemory<T> &gpu_src,
+                                    port::MutableArraySlice<T> host_dst) {
+    auto host_size = host_dst.size() * sizeof(T);
+    CHECK(gpu_src.size() == 0 || host_size >= gpu_src.size());
+    return SynchronousMemcpyD2H(gpu_src, host_size, host_dst.begin());
+  }
+
+  // Blocks the caller while a data segment of the given size is copied from the
+  // GPU source to the GPU destination.
+  bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
+                         const DeviceMemoryBase &gpu_src,
+                         uint64 size) SE_MUST_USE_RESULT;
+
+  // Enqueues an operation onto stream to zero out size bytes at the given GPU
+  // memory location. Neither stream nor location may be null. Returns whether
+  // the operation was successfully enqueued onto the stream.
+  bool MemZero(Stream *stream, DeviceMemoryBase *location,
+               uint64 size) SE_MUST_USE_RESULT;
+
+  // Enqueues an operation onto stream to set 32-bit patterns starting at
+  // location, for byte count given by size. size must be a 32-bit quantity
+  // (i.e. evenly divisible by 4). Returns whether the operation was
+  // successfully enqueued onto the stream.
+  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
+                uint64 size) SE_MUST_USE_RESULT;
+
+  // Enables peer access from this StreamExecutor to memory
+  // allocated by other, such that launched device code, memcpies, etc. may
+  // access it directly.
+  //
+  // Both this StreamExecutor and other must be backed by the same platform
+  // (e.g. CUDA vs. OpenCL) implementation.
+  port::Status EnablePeerAccessTo(StreamExecutor *other);
+
+  // Returns whether it's possible to enable peer access from this
+  // StreamExecutor to memory allocated by another.
+  //
+  // Even when this returns true, EnablePeerAccessTo may fail for other
+  // reasons; this is more an up-front test as to whether it's expressly
+  // forbidden.
+  bool CanEnablePeerAccessTo(StreamExecutor *other);
+
+  // Gets the preferred shared memory configuration for the device to which
+  // this executor is bound.
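+  //
+  // For example (illustrative; the valid configurations are platform-defined,
+  // e.g. kDefault/kFourByte/kEightByte):
+  //
+  //    SharedMemoryConfig prior = executor->GetDeviceSharedMemoryConfig();
+  //    port::Status status = executor->SetDeviceSharedMemoryConfig(
+  //        SharedMemoryConfig::kEightByte);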
+  SharedMemoryConfig GetDeviceSharedMemoryConfig();
+
+  // Sets the preferred shared memory configuration for the device to which
+  // this executor is bound.
+  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config);
+
+  // Obtains metadata about the underlying device.
+  // The value is cached on first use.
+  const DeviceDescription &GetDeviceDescription() const;
+
+  // Returns the underlying device memory usage information, if it is available.
+  // If it is not available (false is returned), free/total may not be
+  // initialized.
+  //
+  // Note: "free" reflects the amount of free memory on the underlying device,
+  // so allocations via other StreamExecutors that have the same underlying
+  // device will be reflected in "free".
+  bool DeviceMemoryUsage(int64 *free, int64 *total) const;
+
+  // The device count reported by this StreamExecutor's platform.
+  // Note: on OpenCL we implicitly select platform zero at the moment.
+  int PlatformDeviceCount() const;
+
+  // Returns whether the StreamExecutor supports BLAS routines for the platform
+  // that underlies this interface.
+  bool SupportsBlas() const;
+
+  // Returns whether the StreamExecutor supports FFT routines for the platform
+  // that underlies this interface.
+  bool SupportsFft() const;
+
+  // Returns whether the StreamExecutor supports RNG routines for the platform
+  // that underlies this interface.
+  bool SupportsRng() const;
+
+  // Returns whether the StreamExecutor supports neural net routines for the
+  // platform that underlies this interface.
+  bool SupportsDnn() const;
+
+  // Returns the device ordinal that this StreamExecutor was initialized with.
+  // Meaningless before initialization.
+  int device_ordinal() const { return device_ordinal_; }
+
+  // Returns a borrowed pointer to the underlying StreamExecutor implementation.
+  internal::StreamExecutorInterface *implementation();
+
+  // Warning: use Stream::ThenLaunch instead, this method is not for general
+  // consumption. However, this is the only way to launch a kernel for which
+  // the type signature is only known at runtime; say, if an application
+  // supports loading/launching kernels with arbitrary type signatures.
+  // In this case, the application is expected to know how to do parameter
+  // packing that obeys the contract of the underlying platform implementation.
+  //
+  // Launches a data parallel kernel with the given thread/block
+  // dimensionality and already-packed args/sizes to pass to the underlying
+  // platform driver.
+  //
+  // This is called by Stream::Launch() to delegate to the platform's launch
+  // implementation in StreamExecutorInterface::Launch().
+  bool Launch(Stream *stream, const ThreadDim &thread_dims,
+              const BlockDim &block_dims, const KernelBase &kernel,
+              const std::vector<KernelArg> &args);
+
+  // Gets-or-creates (creates with memoization) an FftSupport datatype that can
+  // be used to execute FFT routines on the current platform.
+  //
+  // Ownership and user-facing semantics are the same as AsBlas() below.
+  //
+  // Returns null if there was an error initializing the FFT support for the
+  // underlying platform.
+  fft::FftSupport *AsFft();
+
+  // Gets-or-creates (creates with memoization) a DnnSupport datatype that can
+  // be used for neural network routines on the current platform.
+  //
+  // Ownership and user-facing semantics are the same as AsBlas() below.
+  //
+  // Returns null if there was an error initializing the DNN support for the
+  // underlying platform.
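+  //
+  // For example (illustrative):
+  //
+  //    dnn::DnnSupport *dnn = executor->AsDnn();
+  //    if (dnn == nullptr) {
+  //      // DNN support is unavailable or failed to initialize.
+  //    }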
+ dnn::DnnSupport *AsDnn(); + + // Turns StreamExecutor operation tracing on or off. + void EnableTracing(bool enable); + + // Registers a trace listener to receive callbacks for only a single + // StreamExecutor instance. + // To register a listener for all executors for a given platform, see + // Platform::RegisterTraceListener(). + // Does not take ownership of listener. + void RegisterTraceListener(TraceListener* listener); + + // Removes a TraceListener from this StreamExecutor instance. + // Returns false (and logs) in cases where the argument listener was not + // previously registered. + bool UnregisterTraceListener(TraceListener* listener); + + // Converts a DeviceMemory object into a KernelArg object for passing to the + // device driver for kernel launch. + KernelArg DeviceMemoryToKernelArg(const DeviceMemoryBase &gpu_mem) const; + + private: + template <typename BeginCallT, typename CompleteCallT, + typename ReturnT, typename... BeginArgsT> + friend class ScopedTracer; + friend class Event; + friend class Stream; + friend class Timer; + template <typename... Params> + friend class TypedKernel; + template <typename... Args> + friend struct ThenBlasImpl; + + // Gets-or-creates (creates with memoization) a BlasSupport datatype that can + // be used to execute BLAS routines on the current platform. This is typically + // not user-facing, as users will use the Stream::ThenBlas* family of routines + // to entrain BLAS operations. See blas.h for additional details. + // + // Ownership is not transferred to the caller -- ownership is retained by this + // object for memoization. This BLAS interface is also only expected to be + // used by a Stream for entraining calls to BLAS functionality. + // + // Returns null if there was an error initializing the BLAS support for the + // underlying platform. + blas::BlasSupport *AsBlas(); + + // Gets-or-creates (creates with memoization) an RngSupport datatype that can + // be used for random-number-generation routines on the current platform. + // + // Ownership and user-facing is the same as AsBlas() above. + // + // Returns null if there was an error initializing the RNG support for the + // underlying platform. + rng::RngSupport *AsRng(); + + // Causes the host code to synchronously wait for operations entrained onto + // stream to complete. Effectively a join on the asynchronous GPU operations + // enqueued on the stream before this program point. + bool BlockHostUntilDone(Stream *stream); + + // Synchronously allocates size bytes on the underlying platform and returns + // an opaque void* representing that allocation. In the case of failure, + // nullptr is returned. + void *Allocate(uint64 size); + + // Finds and retrieves device memory for the symbol on the underlying + // platform. + bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes); + + // Entrains a memcpy operation onto stream, with a host destination location + // host_dst and a GPU memory source, with target size size. + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, + uint64 size); + + // Entrains a memcpy operation onto stream, with a GPU destination location + // and a host memory source, with target size size. + bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, + uint64 size); + + // Entrains a memcpy operation onto stream, with a GPU destination location + // and a GPU source location, with target size size. 
Peer access should have + // been enabled between the StreamExecutors owning the GPU memory regions. + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, + const DeviceMemoryBase &gpu_src, uint64 size); + + // Entrains on a stream a user-specified function to be run on the host. + // See Stream::ThenDoHostCallback for full details. + bool HostCallback(Stream *stream, std::function<void()> callback); + + // Performs platform-specific allocation and initialization of an event. + port::Status AllocateEvent(Event *event); + + // Performs platform-specific deallocation and cleanup of an event. + port::Status DeallocateEvent(Event *event); + + // Inserts the specified event at the end of the specified stream. + port::Status RecordEvent(Stream *stream, Event *event); + + // Wait for the specified event at the end of the specified stream. + port::Status WaitForEvent(Stream *stream, Event *event); + + // Requests the current status of the event from the underlying platform. + Event::Status PollForEventStatus(Event *event); + + // Allocates stream resources on the underlying platform for subject and + // initializes its internals. + bool AllocateStream(Stream *subject); + + // Deallocates stream resources on the underlying platform. + void DeallocateStream(Stream *subject); + + // Causes dependent to not begin execution until other has finished its + // last-enqueued work. + bool CreateStreamDependency(Stream *dependent, Stream *other); + + // Allocates timer resources on the underlying platform for subject and + // initializes its internals. + bool AllocateTimer(Timer *subject); + + // Deallocates timer resources on the underlying platform. + void DeallocateTimer(Timer *subject); + + // Records a start event for an interval timer. + bool StartTimer(Stream *stream, Timer *timer); + + // Records a stop event for an interval timer. + bool StopTimer(Stream *stream, Timer *timer); + + // Allocates a new metadata object, appropriately populated, on the heap, with + // ownership transfer to caller. + DeviceDescription *PopulateDeviceDescription() const; + + // Adds a task to the port::ThreadPool work queue. These tasks must be + // fire-and-forget and have no external data or timing dependencies; their + // execution order and completion time have no guarantees. + // For an example of an appropriate task, see HostBlas::DoBlasGemmInternal; + // there, temporary internal buffers are freed using this method. + void EnqueueOnBackgroundThread(std::function<void()> task); + + // Adds an AllocRecord for 'opaque' of size 'bytes' to the record map, for + // leak checking. NULL buffer pointers and buffer sizes of 0 will not be + // tracked. + void CreateAllocRecord(void *opaque, uint64 size); + + // Removes the AllocRecord keyed by 'opaque' from the record map. NULL + // pointers will not be erased (as they're not tracked, per above). + void EraseAllocRecord(void *opaque); + + // Calls the relevant TraceListener routine to begin tracing for the specified + // asynchronous method. + template <typename TraceCallT, typename... ArgsT> + void SubmitTrace(TraceCallT trace_call, ArgsT&&... args); + + // Reader/writer lock for class-static StreamExecutor members. + static mutex static_mu_; + + // Reader/writer lock for mutable data structures on this StreamExecutor. + // + // Mutable so that caching functions (like DeviceDescription, AsBlas, etc.) + // can acquire the lock on their first (mutating) call as well. 
+ mutable mutex mu_;
+
+ // A mapping of pointer (to GPU memory) to string representation of the stack
+ // (of the allocating thread) at the time at which the pointer was allocated.
+ std::map<void *, AllocRecord> mem_allocs_ GUARDED_BY(mu_);
+
+ // Pointer to the platform-specific-interface implementation. This is
+ // delegated to by the interface routines in pointer-to-implementation
+ // fashion.
+ std::unique_ptr<internal::StreamExecutorInterface> implementation_;
+
+ // Memoized BLAS support object -- we only want to create this once when asked
+ // for a BLAS interface.
+ std::unique_ptr<blas::BlasSupport> blas_ GUARDED_BY(mu_);
+
+ // Memoized DNN support object -- we only want to create this once when asked
+ // for a DNN interface.
+ std::unique_ptr<dnn::DnnSupport> dnn_ GUARDED_BY(mu_);
+
+ // Memoized FFT support object -- we only want to create this once when asked
+ // for an FFT interface.
+ std::unique_ptr<fft::FftSupport> fft_ GUARDED_BY(mu_);
+
+ // Memoized RNG support object -- we only want to create this once when asked
+ // for an RNG interface.
+ std::unique_ptr<rng::RngSupport> rng_ GUARDED_BY(mu_);
+
+ // Slot to cache the owned DeviceDescription for the underlying device
+ // once it has been queried from DeviceDescription().
+ mutable std::unique_ptr<DeviceDescription> device_description_
+ GUARDED_BY(mu_);
+
+ // The kind of the underlying platform that is being targeted, as passed
+ // during construction.
+ //
+ // Immutable post-initialization.
+ PlatformKind platform_kind_;
+
+ // The device ordinal that this object was initialized with.
+ //
+ // Immutable post-initialization.
+ int device_ordinal_;
+
+ // Executor for handling host callback work that cannot be performed
+ // by a host callback thread - for example, cleanup after a host BLAS routine
+ // (which may make device API calls). This work cannot block the host
+ // callback thread, will be completed asynchronously, and should be treated
+ // as fire-and-forget. Assume no ordering guarantees WRT the tasks enqueued
+ // here.
+ //
+ // Immutable post-initialization. Object is thread-safe.
+ std::unique_ptr<port::ThreadPool> background_threads_;
+
+ // Counter for the current number of live streams. This is used to check for
+ // accidentally-outstanding streams at StreamExecutor teardown time, as well
+ // as to indicate leaks (via a large outstanding count being logged) in the
+ // case where we can't allocate more streams.
+ std::atomic_int_fast32_t live_stream_count_;
+
+ // Only one worker thread is needed; little work will be done by the
+ // executor.
+ static const int kNumBackgroundThreads = 1;
+
+ // Indicates if StreamExecutor operation tracing should be performed.
+ bool tracing_enabled_;
+
+ // The set of TraceListeners registered for this StreamExecutor.
+ std::set<TraceListener*> listeners_ GUARDED_BY(mu_);
+
+ SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
+};
+
+////////////
+// Inlines
+
+template <typename T>
+inline DeviceMemory<T> StreamExecutor::AllocateArray(uint64 element_count) {
+ uint64 bytes = sizeof(T) * element_count;
+ void *opaque = Allocate(bytes);
+ return DeviceMemory<T>::MakeFromByteSize(opaque, bytes);
+}
+
+template <typename T>
+inline port::StatusOr<DeviceMemory<T>> StreamExecutor::GetSymbol(
+ const string &symbol_name) {
+ // If the symbol lookup fails, opaque/bytes are left unchanged, so initialize
+ // them to nullptr/0 here for consistency with DeviceMemory semantics.
+ void *opaque = nullptr;
+ size_t bytes = 0;
+ if (GetSymbol(symbol_name, &opaque, &bytes)) {
+ CHECK_EQ(bytes % sizeof(T), 0);
+ return DeviceMemory<T>::MakeFromByteSize(opaque, bytes);
+ }
+ return port::Status(
+ port::error::NOT_FOUND,
+ port::StrCat("Check if the kernel using the symbol is loaded: ",
+ symbol_name));
+}
+
+template <typename ElemT>
+ScopedDeviceMemory<ElemT>::ScopedDeviceMemory(StreamExecutor *parent,
+ DeviceMemoryBase value)
+ : wrapped_(value), parent_(parent) {}
+
+template <typename ElemT>
+ScopedDeviceMemory<ElemT>::ScopedDeviceMemory(
+ StreamExecutor *parent, std::initializer_list<ElemT> values)
+ : ScopedDeviceMemory(parent, parent->AllocateArray<ElemT>(values.size())) {
+ if (ptr() != nullptr) {
+ std::vector<ElemT> local(values);
+ if (!parent->SynchronousMemcpy(ptr(), const_cast<const ElemT *>(&local[0]),
+ ptr()->size())) {
+ Reset(nullptr);
+ }
+ }
+}
+
+template <typename ElemT>
+ScopedDeviceMemory<ElemT>::~ScopedDeviceMemory() {
+ parent_->Deallocate(&wrapped_);
+}
+
+template <typename ElemT>
+void ScopedDeviceMemory<ElemT>::Reset(DeviceMemory<ElemT> updated) {
+ parent_->Deallocate(&wrapped_);
+ wrapped_ = updated;
+}
+
+template <typename ElemT>
+void ScopedDeviceMemory<ElemT>::Reset(std::nullptr_t) {
+ parent_->Deallocate(&wrapped_);
+ wrapped_ = DeviceMemory<ElemT>{};
+}
+
+template <typename T>
+DeviceMemory<T> StreamExecutor::AllocateZeroed() {
+ void *opaque = Allocate(sizeof(T));
+ if (opaque == nullptr) {
+ return DeviceMemory<T>{};
+ }
+
+ DeviceMemory<T> result = DeviceMemory<T>::MakeFromByteSize(opaque, sizeof(T));
+ bool ok = SynchronousMemZero(&result, sizeof(T));
+ if (!ok) {
+ Deallocate(&result);
+ return DeviceMemory<T>{};
+ }
+
+ return result;
+}
+
+template <typename T>
+DeviceMemory<T> StreamExecutor::AllocateSubBuffer(DeviceMemory<T> *parent,
+ uint64 element_offset,
+ uint64 element_count) {
+ if (element_offset + element_count > parent->ElementCount()) {
+ LOG(ERROR) << "requested sub-buffer allocation (offset + size) is greater "
+ << "than parent allocation size: (" << element_offset << " + "
+ << element_count << ") vs. (" << parent->ElementCount() << ")";
+ return DeviceMemory<T>{};
+ }
+
+ void *opaque = implementation_->AllocateSubBuffer(
+ parent, sizeof(T) * element_offset, sizeof(T) * element_count);
+ if (opaque == nullptr) {
+ return DeviceMemory<T>{};
+ }
+ CreateAllocRecord(opaque, sizeof(T) * element_count);
+ return DeviceMemory<T>(DeviceMemoryBase(opaque, sizeof(T) * element_count,
+ true /* = is_sub_buffer */));
+}
+
+template <typename... Params, typename... Args>
+inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
+ const TypedKernel<Params...> &kernel,
+ Args... args) {
+ KernelInvocationChecker<std::tuple<Params...>,
+ std::tuple<Args...>>::CheckAllStaticAssert();
+ if (ok()) {
+ // This is the core that allows type-safe kernel launching.
+ // Since the platforms take kernel arguments as tuples of (void *, size),
+ // we pack the variadic parameters passed as ...args into the desired
+ // tuple form and pass that packed form to the StreamExecutor::Launch()
+ // implementation.
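+ // For instance, given a TypedKernel<DeviceMemory<float>, int>, the
+ // PackParams call below emits one KernelArg per kernel parameter (note the
+ // reserve(kernel.Arity()) sizing), so the platform-level launch sees a
+ // uniform argument list regardless of the parameter types.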
+ std::vector<KernelArg> kernel_args;
+ kernel_args.reserve(kernel.Arity());
+ kernel.PackParams(&kernel_args, args...);
+ bool ok =
+ parent_->Launch(this, thread_dims, block_dims, kernel, kernel_args);
+ if (!ok) {
+ SetError();
+ LOG(WARNING) << "parent failed to launch kernel: " << &kernel;
+ }
+ }
+ return *this;
+}
+
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_PIMPL_H_
diff --git a/tensorflow/stream_executor/temporary_device_memory.cc b/tensorflow/stream_executor/temporary_device_memory.cc
new file mode 100644
index 0000000000..d11b58813d
--- /dev/null
+++ b/tensorflow/stream_executor/temporary_device_memory.cc
@@ -0,0 +1,53 @@
+#include "tensorflow/stream_executor/temporary_device_memory.h"
+
+#include "tensorflow/stream_executor/stream.h"
+
+namespace perftools {
+namespace gputools {
+
+TemporaryDeviceMemoryBase::~TemporaryDeviceMemoryBase() {
+ parent_->temporary_memory_manager()->MarkFinalized(device_memory_,
+ allocation_generation_,
+ /*must_exist=*/false);
+}
+
+DeviceMemoryBase* TemporaryDeviceMemoryBase::mutable_device_memory() {
+ DCHECK(!IsFinalized())
+ << "should not access device memory after finalization";
+ return &device_memory_;
+}
+
+const DeviceMemoryBase& TemporaryDeviceMemoryBase::device_memory() const {
+ DCHECK(!IsFinalized())
+ << "should not access device memory after finalization";
+ return device_memory_;
+}
+
+void TemporaryDeviceMemoryBase::Finalize() {
+ DCHECK(!IsFinalized()) << "should not finalize more than once";
+ parent_->temporary_memory_manager()->MarkFinalized(device_memory_,
+ allocation_generation_,
+ /*must_exist=*/true);
+}
+
+bool TemporaryDeviceMemoryBase::IsFinalized() const {
+ return parent_->temporary_memory_manager()->IsFinalized(
+ device_memory_, allocation_generation_);
+}
+
+bool TemporaryDeviceMemoryBase::IsAllocated() const {
+ return parent_->temporary_memory_manager()->HasAllocated(
+ device_memory_, allocation_generation_);
+}
+
+TemporaryDeviceMemoryBase::TemporaryDeviceMemoryBase(
+ Stream* parent, DeviceMemoryBase device_memory,
+ uint64 allocation_generation)
+ : device_memory_(device_memory),
+ allocation_generation_(allocation_generation),
+ parent_(parent) {
+ DCHECK(IsAllocated());
+}
+
+} // namespace gputools
+} // namespace perftools
diff --git a/tensorflow/stream_executor/temporary_device_memory.h b/tensorflow/stream_executor/temporary_device_memory.h
new file mode 100644
index 0000000000..4e7c63056b
--- /dev/null
+++ b/tensorflow/stream_executor/temporary_device_memory.h
@@ -0,0 +1,123 @@
+// Temporary memories are used to allocate scratch space required by an
+// operation about to be enqueued onto a stream.
+//
+// std::unique_ptr<TemporaryDeviceMemory<float>> temporary_memory =
+// stream.AllocateTemporaryArray<float>(1024).ConsumeValueOrDie();
+// // ... enqueue stuff onto the stream using the temporary memory ...
+// // Note that the memory is accessible via
+// // temporary_memory->device_memory() and similar.
+//
+// // Finalize the temporary memory. The underlying device memory may
+// // be released any time after this program point, as another thread may
+// // call Stream::BlockHostUntilDone, causing synchronization. This
+// // finalization also happens automatically for the user if the unique_ptr
+// // goes out of scope.
+// temporary_memory->Finalize();
+//
+// WARNING: do NOT hold onto the device memory associated with temporary_memory
+// after finalization.
+// If temporary_memory->device_memory() is used after the temporary memory is
+// finalized, it will cause a DCHECK failure.
+//
+// Note that standard usage takes advantage of the type-safe wrapper,
+// TemporaryDeviceMemory<T>, defined below.
+//
+// Also see tests for executable sample usage.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_DEVICE_MEMORY_H_
+#define TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_DEVICE_MEMORY_H_
+
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+namespace internal {
+class TemporaryMemoryManager;
+}
+
+// Untyped base class (analogous to a void*) for temporary device memory
+// allocations associated with a stream.
+class TemporaryDeviceMemoryBase {
+ public:
+ // Marks the temporary memory as finalized if it is not already marked as
+ // such.
+ ~TemporaryDeviceMemoryBase();
+
+ // Precondition: !IsFinalized()
+ DeviceMemoryBase* mutable_device_memory();
+
+ // Precondition: !IsFinalized()
+ const DeviceMemoryBase& device_memory() const;
+
+ // "Finalizes" this temporary memory, making it acceptable to release at the
+ // next stream synchronization point -- the device memory can be reclaimed at
+ // any time after the temporary memory is marked as finalized (e.g. if a
+ // separate thread calls Stream::BlockHostUntilDone). This may only be
+ // called once -- see the precondition below.
+ //
+ // Precondition: !IsFinalized()
+ void Finalize();
+
+ // Returns true iff the temporary memory is finalized (that is, the user is
+ // done referring to the temporary device memory, and thus it can be released
+ // at the next stream synchronization point).
+ bool IsFinalized() const;
+
+ // Returns true iff the temporary memory is still allocated.
+ //
+ // Note: this is a polling call; no guarantee is made that the temporary
+ // memory is still allocated after the call has completed.
+ bool IsAllocated() const;
+
+ private:
+ friend class internal::TemporaryMemoryManager;
+ friend class TemporaryDeviceMemoryTest;
+
+ // Note: construction DCHECKs that the memory is known-allocated in the
+ // stream's temporary-allocation-manager.
+ TemporaryDeviceMemoryBase(Stream* parent, DeviceMemoryBase device_memory,
+ uint64 allocation_generation);
+
+ // The device memory region that has been allocated.
+ DeviceMemoryBase device_memory_;
+
+ // The generation counter value for the temporary memory record in the
+ // temporary memory manager.
+ uint64 allocation_generation_;
+
+ // The stream that this temporary memory was allocated for.
+ Stream* parent_;
+};
+
+// Type-safe wrapper around the base type (which is analogous to a void*).
+template <typename T>
+class TemporaryDeviceMemory : public TemporaryDeviceMemoryBase {
+ public:
+ // Type-safe wrapper around TemporaryDeviceMemoryBase::mutable_device_memory.
+ DeviceMemory<T>* mutable_device_memory() {
+ StaticSlicingAssertionDummy();
+ return reinterpret_cast<DeviceMemory<T>*>(
+ TemporaryDeviceMemoryBase::mutable_device_memory());
+ }
+
+ // Type-safe wrapper around TemporaryDeviceMemoryBase::device_memory.
+ const DeviceMemory<T>& device_memory() const {
+ StaticSlicingAssertionDummy();
+ return reinterpret_cast<const DeviceMemory<T>&>(
+ TemporaryDeviceMemoryBase::device_memory());
+ }
+
+ private:
+ static void StaticSlicingAssertionDummy() {
+ static_assert(
+ sizeof(TemporaryDeviceMemory) == sizeof(TemporaryDeviceMemoryBase),
+ "derived class is simply a wrapper, no members may be added due to "
+ "slicing");
+ }
+};
+
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/temporary_memory_manager.cc b/tensorflow/stream_executor/temporary_memory_manager.cc
new file mode 100644
index 0000000000..0352aa4b2b
--- /dev/null
+++ b/tensorflow/stream_executor/temporary_memory_manager.cc
@@ -0,0 +1,113 @@
+#include "tensorflow/stream_executor/temporary_memory_manager.h"
+
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+
+namespace perftools {
+namespace gputools {
+namespace internal {
+
+void TemporaryMemoryManager::ForceDeallocateAll() {
+ mutex_lock lock(mutex_);
+ VLOG(1) << "force-deallocating " << records_.size() << " remaining records";
+ for (auto it = records_.begin(); it != records_.end(); ++it) {
+ DeviceMemoryBase device_memory = it->first;
+ stream_->parent()->Deallocate(&device_memory);
+ }
+}
+
+void TemporaryMemoryManager::MarkFinalized(
+ const DeviceMemoryBase& device_memory, uint64 generation, bool must_exist) {
+ mutex_lock lock(mutex_);
+ auto it = records_.find(device_memory);
+ if (it == records_.end()) {
+ if (must_exist) {
+ LOG(FATAL) << "attempted to mark finalization for temporary "
+ "memory that does not exist";
+ }
+ return;
+ }
+ it->second.finalized = true;
+}
+
+void TemporaryMemoryManager::DeallocateFinalizedTemporaries() {
+ mutex_lock lock(mutex_);
+ int deallocated_count = 0;
+ for (auto it = records_.begin(); it != records_.end();) {
+ if (it->second.finalized) {
+ DeviceMemoryBase device_memory = it->first;
+ stream_->parent()->Deallocate(&device_memory);
+ ++deallocated_count;
+ it = records_.erase(it);
+ } else {
+ ++it;
+ }
+ }
+ VLOG(1) << "deallocated " << deallocated_count << " finalized temporaries";
+}
+
+bool TemporaryMemoryManager::IsFinalized(const DeviceMemoryBase& device_memory,
+ uint64 allocation_generation) const {
+ mutex_lock lock(mutex_);
+ auto it = records_.find(device_memory);
+ if (it == records_.end()) {
+ return true; // If there's no record present it's vacuously finalized.
+ }
+
+ if (it->second.allocation_generation == allocation_generation) {
+ return it->second.finalized;
+ }
+
+ // If the allocation generation did not match, it's vacuously true.
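+ // A generation mismatch means this device address has since been freed and
+ // reallocated under a newer generation, so the allocation the caller is
+ // asking about no longer exists (see the generation counter notes in
+ // temporary_memory_manager.h).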
+ return true;
+}
+
+bool TemporaryMemoryManager::HasAllocated(const DeviceMemoryBase& device_memory,
+ uint64 generation) const {
+ mutex_lock lock(mutex_);
+ auto it = records_.find(device_memory);
+ if (it == records_.end()) {
+ return false;
+ }
+ return it->second.allocation_generation == generation;
+}
+
+port::StatusOr<std::unique_ptr<TemporaryDeviceMemoryBase>>
+TemporaryMemoryManager::AllocateArrayBase(uint64 element_count,
+ uint64 element_size) {
+ uint64 byte_size = element_count * element_size;
+ DeviceMemoryBase device_memory =
+ stream_->parent()->AllocateArray<uint8>(byte_size);
+ if (device_memory == nullptr) {
+ return port::Status(port::error::RESOURCE_EXHAUSTED,
+ port::StrCat("could not allocate temporary memory of ",
+ byte_size, " bytes"));
+ }
+
+ uint64 generation;
+
+ // Add the record before instantiating the device memory instance so we can
+ // check the allocation invariant at TemporaryDeviceMemory construction time.
+ {
+ mutex_lock lock(mutex_);
+ generation = ++generation_;
+ DCHECK(records_.find(device_memory) == records_.end());
+ records_[device_memory] = {generation,
+ /*finalized=*/false};
+ }
+
+ VLOG(1) << port::Printf(
+ "stream %p allocated temporary device memory at %p (size %llu) in "
+ "generation %llu",
+ stream_, device_memory.opaque(), byte_size, generation);
+ std::unique_ptr<TemporaryDeviceMemoryBase> result(
+ new TemporaryDeviceMemoryBase(stream_, device_memory, generation));
+ return std::move(result);
+}
+
+} // namespace internal
+} // namespace gputools
+} // namespace perftools
diff --git a/tensorflow/stream_executor/temporary_memory_manager.h b/tensorflow/stream_executor/temporary_memory_manager.h
new file mode 100644
index 0000000000..847f0f2182
--- /dev/null
+++ b/tensorflow/stream_executor/temporary_memory_manager.h
@@ -0,0 +1,138 @@
+// The temporary-memory-manager is a helper class for a Stream to keep track of
+// temporary allocations. These allocations defer their deallocation to the next
+// Stream::BlockHostUntilDone call for efficiency purposes (as deallocation
+// itself generally forces synchronization to occur).
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_MEMORY_MANAGER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_MEMORY_MANAGER_H_
+
+#include <map>
+#include <memory>
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/temporary_device_memory.h"
+
+namespace perftools {
+namespace gputools {
+namespace internal {
+
+// Record used inside the TemporaryMemoryManager as metadata for a given device
+// memory region.
+struct TemporaryMemoryRecord {
+ // What "generation" this record was allocated in.
+ //
+ // Currently the generation counter is bumped for every allocation, but this
+ // could be made coarser if necessary.
+ uint64 allocation_generation;
+
+ // Notes whether the temporary memory has been marked as finalized, such that
+ // we can release the DeviceMemory associated with this record at
+ // synchronization time.
+ bool finalized;
+};
+
+// Manages temporary memories associated with a stream -- keeps records of
+// outstanding temporaries and their state, and can deallocate them
+// appropriately at points in the Stream lifecycle (e.g. BlockHostUntilDone,
+// destruction).
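+//
+// Rough usage sketch. Callers normally reach this class through the
+// stream-level entry point shown in temporary_device_memory.h (assumed here
+// to forward to AllocateArray<T>; that wrapper is not part of this header):
+//
+//   auto temporary = stream.AllocateTemporaryArray<float>(1024)
+//                        .ConsumeValueOrDie();
+//   // ... enqueue operations that use temporary->device_memory() ...
+//   temporary->Finalize();
+//   // The backing memory becomes reclaimable at the next synchronization
+//   // point, when DeallocateFinalizedTemporaries() runs.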
+class TemporaryMemoryManager {
+ public:
+ explicit TemporaryMemoryManager(Stream* stream)
+ : generation_(0), stream_(stream) {}
+
+ // Allocates a temporary array that is then managed by this object.
+ template <typename T>
+ port::StatusOr<std::unique_ptr<TemporaryDeviceMemory<T>>> AllocateArray(
+ uint64 element_count);
+
+ // Forces deallocation of all managed temporary memory regions.
+ //
+ // Called, for example, when the Stream owning this temporary memory manager
+ // is destroyed.
+ //
+ // Note: These calls to Deallocate will likely force synchronization.
+ void ForceDeallocateAll();
+
+ // Marks the given memory region as finalized.
+ //
+ // If must_exist is set, this will check-fail if the temporary memory record
+ // is not found.
+ void MarkFinalized(const DeviceMemoryBase& device_memory, uint64 generation,
+ bool must_exist);
+
+ // Deallocates temporary memories that have been finalized.
+ //
+ // Note: These calls to Deallocate will likely force synchronization, so this
+ // method is meant to be called just before a BlockHostUntilDone is performed.
+ void DeallocateFinalizedTemporaries();
+
+ // Returns whether the provided device_memory is finalized.
+ //
+ // In the vacuous case where the device memory doesn't appear in the temporary
+ // memory records, it is either not a temporary at all or has already been
+ // deallocated; in either case this returns true.
+ bool IsFinalized(const DeviceMemoryBase& device_memory,
+ uint64 allocation_generation) const;
+
+ // Returns whether the manager has a live allocation record for the given
+ // device memory pointer with the given generation counter.
+ //
+ // Note: this is a polling call -- there is no guarantee that the region is
+ // still allocated once the call has completed.
+ bool HasAllocated(const DeviceMemoryBase& device_memory,
+ uint64 generation) const;
+
+ private:
+ // Allocates an array without type parameterization, so that the
+ // implementation can live in the source file. Without this base allocation
+ // method, we incur a circular dependency between the StreamExecutor
+ // definition and this class' definition.
+ port::StatusOr<std::unique_ptr<TemporaryDeviceMemoryBase>> AllocateArrayBase(
+ uint64 element_count, uint64 element_size);
+
+ // Mutex to guard temporary record state.
+ mutable mutex mutex_;
+
+ // Mapping from device memory to the current (live) temporary memory record.
+ //
+ // If a device memory is not in this mapping, it is not a temporary currently
+ // allocated and owned by this temporary memory manager.
+ std::map<DeviceMemoryBase, TemporaryMemoryRecord> records_ GUARDED_BY(mutex_);
+
+ // Allocation generation -- we bump this counter to distinguish temporary
+ // memory handles that have been deallocated and later reallocated at the same
+ // device memory address.
+ uint64 generation_ GUARDED_BY(mutex_);
+
+ // The stream (parent object) for this temporary memory manager -- allocations
+ // are performed through this stream handle.
+ Stream* stream_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(TemporaryMemoryManager);
+};
+
+////////////
+// Inlines
+
+template <typename T>
+port::StatusOr<std::unique_ptr<TemporaryDeviceMemory<T>>>
+TemporaryMemoryManager::AllocateArray(uint64 element_count) {
+ port::StatusOr<std::unique_ptr<TemporaryDeviceMemoryBase>> temporary_memory =
+ AllocateArrayBase(element_count, sizeof(T));
+ if (!temporary_memory.ok()) {
+ return temporary_memory.status();
+ }
+
+ return std::unique_ptr<TemporaryDeviceMemory<T>>(
+ reinterpret_cast<TemporaryDeviceMemory<T>*>(
+ temporary_memory.ConsumeValueOrDie().release()));
+}
+
+} // namespace internal
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_MEMORY_MANAGER_H_
diff --git a/tensorflow/stream_executor/timer.cc b/tensorflow/stream_executor/timer.cc
new file mode 100644
index 0000000000..46210a2346
--- /dev/null
+++ b/tensorflow/stream_executor/timer.cc
@@ -0,0 +1,41 @@
+#include "tensorflow/stream_executor/timer.h"
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace perftools {
+namespace gputools {
+
+static internal::TimerInterface *CreateTimerImplementation(
+ StreamExecutor *parent) {
+ PlatformKind platform_kind = parent->platform_kind();
+ if (platform_kind == PlatformKind::kCuda) {
+ return (*internal::MakeCUDATimerImplementation())(parent);
+ } else if (platform_kind == PlatformKind::kOpenCL ||
+ platform_kind == PlatformKind::kOpenCLAltera) {
+ return (*internal::MakeOpenCLTimerImplementation())(parent);
+ } else if (platform_kind == PlatformKind::kHost) {
+ return internal::MakeHostTimerImplementation(parent);
+ } else if (platform_kind == PlatformKind::kMock) {
+ return nullptr;
+ } else {
+ LOG(FATAL) << "cannot create timer implementation for platform kind: "
+ << PlatformKindString(platform_kind);
+ }
+}
+
+Timer::Timer(StreamExecutor *parent)
+ : implementation_(CreateTimerImplementation(parent)), parent_(parent) {}
+
+Timer::~Timer() { parent_->DeallocateTimer(this); }
+
+uint64 Timer::Microseconds() const { return implementation_->Microseconds(); }
+
+uint64 Timer::Nanoseconds() const { return implementation_->Nanoseconds(); }
+
+} // namespace gputools
+} // namespace perftools
diff --git a/tensorflow/stream_executor/timer.h b/tensorflow/stream_executor/timer.h
new file mode 100644
index 0000000000..ff54c06180
--- /dev/null
+++ b/tensorflow/stream_executor/timer.h
@@ -0,0 +1,60 @@
+#ifndef TENSORFLOW_STREAM_EXECUTOR_TIMER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_TIMER_H_
+
+#include <memory>
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+namespace internal {
+class TimerInterface;
+} // namespace internal
+
+class StreamExecutor;
+
+// An interval timer, suitable for use in timing the operations which occur in
+// streams.
+//
+// Thread-hostile: CUDA associates a CUDA-context with a particular thread in
+// the system. Any operation that a user attempts to perform by using a Timer
+// on a thread not-associated with the CUDA-context has unknown behavior at the
+// current time; see b/13176597
+class Timer {
+ public:
+ // Instantiates a timer tied to parent as a platform executor.
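+ //
+ // A rough usage sketch, assuming Stream-level wrappers (e.g.
+ // Stream::InitTimer/ThenStartTimer/ThenStopTimer) that forward to the
+ // private StreamExecutor timer hooks -- those wrappers are presumed to live
+ // in stream.h and are not part of this header:
+ //
+ //   Timer timer{stream_exec};
+ //   stream.InitTimer(&timer)
+ //       .ThenStartTimer(&timer)
+ //       // ... enqueue the work to be timed on the stream ...
+ //       .ThenStopTimer(&timer)
+ //       .BlockHostUntilDone();
+ //   uint64 elapsed_usecs = timer.Microseconds();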
+ explicit Timer(StreamExecutor *parent);
+
+ // Deallocates any timer resources that the parent StreamExecutor has bestowed
+ // upon this object.
+ ~Timer();
+
+ // Returns the elapsed number of microseconds for a completed timer.
+ // Completed means the timer has been through a start/stop lifecycle.
+ uint64 Microseconds() const;
+
+ // Returns the elapsed number of nanoseconds for a completed timer.
+ // Completed means the timer has been through a start/stop lifecycle.
+ uint64 Nanoseconds() const;
+
+ // Returns the (opaque) backing platform ITimer instance. Ownership is
+ // not transferred to the caller.
+ internal::TimerInterface *implementation() { return implementation_.get(); }
+
+ private:
+ // Platform-dependent implementation of the timer internals for the underlying
+ // platform. This class just delegates to this opaque instance.
+ std::unique_ptr<internal::TimerInterface> implementation_;
+
+ // The StreamExecutor that manages the platform-specific internals for this
+ // timer.
+ StreamExecutor *parent_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(Timer);
+};
+
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_TIMER_H_
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
new file mode 100644
index 0000000000..dcbb223f4f
--- /dev/null
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -0,0 +1,59 @@
+// This file defines the StreamExecutor trace listener, used for inserting
+// non-device-specific instrumentation into the StreamExecutor.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel.h"
+#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/lib/status.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+
+// Traces StreamExecutor PIMPL-level events.
+// The few StreamExecutor interfaces that are synchronous have both Begin and
+// Complete versions of their trace calls. Asynchronous operations only have
+// Submit calls, as execution of the underlying operations is device-specific.
+// As all tracing calls mirror StreamExecutor routines, documentation here is
+// minimal.
+//
+// All calls have default implementations that perform no work; subclasses
+// should override functionality of interest. Keep in mind that these routines
+// are not called on a dedicated thread, so callbacks should execute quickly.
+//
+// Note: This API is constructed on an as-needed basis. Users should add
+// support for further StreamExecutor operations as required. By enforced
+// convention (see SCOPED_TRACE in stream_executor_pimpl.cc), synchronous
+// tracepoints should be named NameBegin and NameComplete.
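+//
+// A minimal listener sketch for illustration. The LoggingListener name is
+// hypothetical; RegisterTraceListener/UnregisterTraceListener are the
+// per-executor registration calls declared on StreamExecutor:
+//
+//   class LoggingListener : public TraceListener {
+//    public:
+//     void BlockHostUntilDoneBegin(int64 correlation_id,
+//                                  Stream* stream) override {
+//       LOG(INFO) << "blocking on stream " << stream;
+//     }
+//   };
+//
+//   LoggingListener listener;
+//   stream_exec->RegisterTraceListener(&listener);  // no ownership transfer
+//   // ... perform work to be traced ...
+//   stream_exec->UnregisterTraceListener(&listener);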
+class TraceListener {
+ public:
+ virtual ~TraceListener() {}
+
+ virtual void LaunchSubmit(Stream* stream, const ThreadDim& thread_dims,
+ const BlockDim& block_dims,
+ const KernelBase& kernel,
+ const std::vector<KernelArg>& args) {}
+
+ virtual void SynchronousMemcpyH2DBegin(int64 correlation_id,
+ const void* host_src, int64 size,
+ DeviceMemoryBase* gpu_dst) {}
+ virtual void SynchronousMemcpyH2DComplete(int64 correlation_id,
+ const port::Status* result) {}
+
+ virtual void SynchronousMemcpyD2HBegin(int64 correlation_id,
+ const DeviceMemoryBase& gpu_src,
+ int64 size, void* host_dst) {}
+ virtual void SynchronousMemcpyD2HComplete(int64 correlation_id,
+ const port::Status* result) {}
+
+ virtual void BlockHostUntilDoneBegin(int64 correlation_id, Stream* stream) {}
+ virtual void BlockHostUntilDoneComplete(int64 correlation_id, bool result) {}
+};
+
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_