diff options
author | 2017-07-18 19:36:18 -0700 | |
---|---|---|
committer | 2017-07-18 19:40:33 -0700 | |
commit | 491beb74cc5a08693d0b884d10532514ac1aef19 (patch) | |
tree | 6ef4b12d84de7c922816ee46c873b58a9fc5e203 | |
parent | 9293c557bd2df05658727418067ccee7a77a4be3 (diff) |
Automated g4 rollback of changelist 162423171
PiperOrigin-RevId: 162437318
-rw-r--r-- | tensorflow/core/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/core/kernels/BUILD | 14 | ||||
-rw-r--r-- | tensorflow/core/kernels/conv_ops_gpu.h | 129 | ||||
-rw-r--r-- | tensorflow/core/kernels/gpu_utils.h | 166 | ||||
-rw-r--r-- | tensorflow/core/kernels/matmul_op.cc | 312 | ||||
-rw-r--r-- | tensorflow/core/kernels/matmul_op.h | 64 | ||||
-rw-r--r-- | tensorflow/core/util/matmul_autotune.cc | 51 | ||||
-rw-r--r-- | tensorflow/core/util/matmul_autotune.h | 28 | ||||
-rw-r--r-- | tensorflow/python/BUILD | 42 | ||||
-rw-r--r-- | tensorflow/python/kernel_tests/matmul_op_test.py | 3 | ||||
-rw-r--r-- | tensorflow/python/ops/matmul_benchmark.py | 143 | ||||
-rw-r--r-- | tensorflow/python/ops/matmul_benchmark_test.py | 122 | ||||
-rw-r--r-- | tensorflow/stream_executor/blas.cc | 4 | ||||
-rw-r--r-- | tensorflow/stream_executor/blas.h | 138 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_blas.cc | 181 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_blas.h | 17 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.cc | 178 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.h | 63 |
18 files changed, 196 insertions, 1460 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 1dbb90d263..bb17d8832f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -405,7 +405,6 @@ tf_cuda_library( "util/tensor_slice_reader_cache.h", "util/tensor_slice_writer.h", "util/use_cudnn.h", - "util/matmul_autotune.h", "util/util.h", "util/work_sharder.h", ] + select({ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 6e983a9ea7..21194e80c3 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -157,7 +157,6 @@ cc_library( hdrs = ["conv_2d.h"], deps = [ ":eigen_helpers", - ":gpu_util_hdrs", "//tensorflow/core:framework", "//third_party/eigen3", ], @@ -249,15 +248,6 @@ cc_library( ], ) -cc_library( - name = "gpu_util_hdrs", - hdrs = ["gpu_utils.h"], - deps = [ - ":eigen_helpers", - "//third_party/eigen3", - ], -) - tf_cc_test( name = "ops_util_test", size = "small", @@ -2416,9 +2406,7 @@ tf_kernel_library( ], "//conditions:default": [], }), - deps = MATH_DEPS + [ - ":gpu_util_hdrs", - ] + select({ + deps = MATH_DEPS + select({ ":xsmm": [ "@libxsmm_archive//:xsmm_avx", ], diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index d2f07c2207..b268f8dbd2 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -21,12 +21,27 @@ limitations under the License. #include <tuple> #include <unordered_map> #include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/kernels/gpu_utils.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor.h" namespace tensorflow { +// TODO(zhengxq): move this to gpu_util.h. The use of such wrappers is wide +// spread. +template <typename T> +inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, + uint64 size) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), + size * sizeof(T)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} + // Get the Cudnn workspace limit from the environment variable, which is in MB. // Return the workspace memory limit in bytes. If no value is set, return the // default value. @@ -41,10 +56,12 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator { virtual ~CudnnScratchAllocator() {} CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context) : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} - int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override { + virtual int64 GetMemoryLimitInBytes( + perftools::gputools::Stream* stream) override { return memory_limit_; } - perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>> + virtual perftools::gputools::port::StatusOr< + perftools::gputools::DeviceMemory<uint8>> AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override { Tensor temporary_memory; if (byte_size > memory_limit_) { @@ -168,6 +185,112 @@ class ConvParameters { typedef Eigen::GpuDevice GPUDevice; +// A helper class that looks up the best autotuned config from parameters. +// Due to the noisy nature of autotune, especially with multiple devices, it +// only accepts a config if its margin exceeds a threshold. +// For the same shape configs, if a new best config matches the previous best, +// they get promoted; otherwise, the winner gets demoted. This process stops +// when the winner's score exceeds the threshold. +// In a bad case when two configs are very close to each other and flips +// back and forth randomly, the expected number of experiments before autotune +// settles is O(threshold ^ 2). So we recommend that number of warmup runs +// for any benchmarks. +template <typename Parameters, typename Config> +class AutoTuneMap { + public: + bool Find(const Parameters& params, Config* config) const { + mutex_lock lock(mu_); + auto iter = params_config_map_.find(params); + if (iter == params_config_map_.end() || + iter->second.score < min_score_threshold_) { + return false; + } + *config = iter->second.config; + return true; + } + void Insert(const ConvParameters& params, const Config& config) { + mutex_lock lock(mu_); + auto iter = params_config_map_.find(params); + int new_score = 0; + if (iter == params_config_map_.end()) { + // Create a new entry if params is new. + VLOG(1) << GetActionSummary("creates", params, config); + params_config_map_.insert(std::make_pair(params, ValueType{config, 1})); + new_score = 1; + } else if (iter->second.score < min_score_threshold_) { + DCHECK(iter->second.score > 0); + if (iter->second.config != config) { + // If it is different from the current winner, demotes the winner. + VLOG(1) << GetActionSummary("demotes", params, config); + new_score = --iter->second.score; + if (new_score <= 0) { + VLOG(1) << GetActionSummary("erases", params, config); + params_config_map_.erase(iter); + } + } else { + // If it is the same as the current winner, promotes the winner. + VLOG(1) << GetActionSummary("promotes", params, config); + new_score = ++iter->second.score; + } + } + if (new_score >= min_score_threshold_) { + VLOG(1) << GetActionSummary("accepts", params, config); + } + } + + private: + AutoTuneMap(const string& name) : name_(name) { + min_score_threshold_ = 1; + const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD"); + if (threshold_str != nullptr) { + strings::safe_strto32(threshold_str, &min_score_threshold_); + } + min_score_threshold_ = std::max(min_score_threshold_, 1); + } + + template <class Group, class Params, class Cfg> + friend class AutoTuneSingleton; + + struct Hasher { + std::size_t operator()(const Parameters& parameter) const { + return parameter.hash(); + } + }; + + string GetActionSummary(StringPiece action, const Parameters& params, + const Config& config) { + return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(), + action.ToString().c_str(), params.ToString().c_str(), + config.ToString().c_str()); + } + + mutable mutex mu_; + struct ValueType { + Config config; + int32 score; + }; + std::unordered_map<Parameters, ValueType, Hasher> params_config_map_ + GUARDED_BY(mu_); + string name_; + int32 min_score_threshold_; + + TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap); +}; + +// A Singleton helper that manages the global autotune results by groups. +// The caller specified arbitrary Group type that can distinguish between +// different autotune results, even if their Parameters and Configs are the +// same. +template <class Group, typename Parameters, typename Config> +class AutoTuneSingleton { + public: + typedef AutoTuneMap<Parameters, Config> AutoTuneType; + static AutoTuneType* GetInstance() { + static AutoTuneType* instance = new AutoTuneType(Group::name()); + return instance; + } +}; + } // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h deleted file mode 100644 index 18bfb8ce8e..0000000000 --- a/tensorflow/core/kernels/gpu_utils.h +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ - -#if GOOGLE_CUDA - -#include <unordered_map> - -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stream_executor.h" - -namespace tensorflow { - -template <typename T> -inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, - uint64 size) { - perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), - size * sizeof(T)); - perftools::gputools::DeviceMemory<T> typed(wrapped); - return typed; -} - - -// A helper class that looks up the best autotuned config from parameters. -// Due to the noisy nature of autotune, especially with multiple devices, it -// only accepts a config if its margin exceeds a threshold. -// For the same shape configs, if a new best config matches the previous best, -// they get promoted; otherwise, the winner gets demoted. This process stops -// when the winner's score exceeds the threshold. -// In a bad case when two configs are very close to each other and flips -// back and forth randomly, the expected number of experiments before autotune -// settles is O(threshold ^ 2). So we recommend that number of warmup runs -// for any benchmarks. -template <typename Parameters, typename Config> -class AutoTuneMap { - public: - bool Find(const Parameters& params, Config* config) const { - mutex_lock lock(mu_); - auto iter = params_config_map_.find(params); - if (iter == params_config_map_.end() || - (iter->second.score < min_score_threshold_ && - iter->second.count <= max_autotune_count_)) { - return false; - } - *config = iter->second.config; - return true; - } - void Insert(const Parameters& params, const Config& config) { - mutex_lock lock(mu_); - auto iter = params_config_map_.find(params); - int new_score = 0; - if (iter == params_config_map_.end()) { - // Create a new entry if params is new. - VLOG(1) << GetActionSummary("creates", params, config); - params_config_map_.insert( - std::make_pair(params, ValueType{config, 1, 1})); - new_score = 1; - } else if (iter->second.score < min_score_threshold_ && - iter->second.count <= max_autotune_count_) { - DCHECK_GT(iter->second.score, 0); - if (iter->second.config != config) { - // If it is different from the current winner, demotes the winner. - VLOG(1) << GetActionSummary("demotes", params, config); - new_score = --iter->second.score; - ++iter->second.count; - if (new_score <= 0) { - VLOG(1) << GetActionSummary("erases", params, config); - params_config_map_.erase(iter); - } - } else { - // If it is the same as the current winner, promotes the winner. - VLOG(1) << GetActionSummary("promotes", params, config); - new_score = ++iter->second.score; - ++iter->second.count; - } - } - if (new_score >= min_score_threshold_) { - VLOG(1) << GetActionSummary("accepts", params, config); - } - } - - private: - AutoTuneMap(const string& name) : name_(name) { - min_score_threshold_ = 1; - int min_warmup_iterations = 10; - const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD"); - if (threshold_str != nullptr) { - strings::safe_strto32(threshold_str, &min_score_threshold_); - } - const char* min_warmup_iteration_str = - getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS"); - if (min_warmup_iteration_str != nullptr) { - strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations); - } - min_score_threshold_ = std::max(min_score_threshold_, 1); - max_autotune_count_ = std::max( - 5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations); - } - - template <class Group, class Params, class Cfg> - friend class AutoTuneSingleton; - - struct Hasher { - std::size_t operator()(const Parameters& parameter) const { - return parameter.hash(); - } - }; - - string GetActionSummary(StringPiece action, const Parameters& params, - const Config& config) { - return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(), - action.ToString().c_str(), params.ToString().c_str(), - config.ToString().c_str()); - } - - mutable mutex mu_; - struct ValueType { - Config config; - int32 score; - int32 count; - }; - std::unordered_map<Parameters, ValueType, Hasher> params_config_map_ - GUARDED_BY(mu_); - string name_; - int32 min_score_threshold_; - int32 max_autotune_count_; - - TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap); -}; - -// A Singleton helper that manages the global autotune results by groups. -// The caller specified arbitrary Group type that can distinguish between -// different autotune results, even if their Parameters and Configs are the -// same. -template <class Group, typename Parameters, typename Config> -class AutoTuneSingleton { - public: - typedef AutoTuneMap<Parameters, Config> AutoTuneType; - static AutoTuneType* GetInstance() { - static AutoTuneType* instance = new AutoTuneType(Group::name()); - return instance; - } -}; - -} // namespace tensorflow - -#endif // GOOGLE_CUDA - -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index 62c5ecfe81..8003f7ff67 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -23,15 +23,27 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/fill_functor.h" -#include "tensorflow/core/util/matmul_autotune.h" + #if GOOGLE_CUDA #include "cuda/include/cuda.h" -#include "tensorflow/core/kernels/gpu_utils.h" #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA namespace tensorflow { +#if GOOGLE_CUDA + +namespace { +template <typename T> +perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} +} // namespace + +#endif // GOOGLE_CUDA + typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL @@ -111,16 +123,10 @@ bool ExplicitVectorMatrixOptimization<Eigen::half>( template <typename Device, typename T> struct LaunchMatMulBase { -#if GOOGLE_CUDA - typedef perftools::gputools::blas::AlgorithmType AlgorithmType; -#else - typedef int64 AlgorithmType; -#endif // GOOGLE_CUDA - static void launch( - OpKernelContext* ctx, const Tensor& a, const Tensor& b, + OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b, const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, - std::vector<AlgorithmType>* algorithms, bool use_aututone, Tensor* out) { + Tensor* out) { #ifndef TENSORFLOW_USE_SYCL // An explicit vector-matrix multiply is much better optimized than an // implicit one and this is a bottleneck during non-batched inference. @@ -134,10 +140,6 @@ struct LaunchMatMulBase { } #endif // TENSORFLOW_USE_SYCL } - - static void GetBlasGemmAlgorithm(OpKernelConstruction* ctx, - std::vector<int64>* algorithms, - bool* algorithm_set_flag) {} }; // On CPUs, we ignore USE_CUBLAS template <typename T> @@ -157,39 +159,24 @@ struct LaunchMatMul<SYCLDevice, T, USE_CUBLAS> : public LaunchMatMulSYCL<T> {}; #if GOOGLE_CUDA namespace { - template <typename T> struct LaunchBlasGemv { - static void Compute( - OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans, - uint64 m, uint64 n, const perftools::gputools::DeviceMemory<T>& a, - const perftools::gputools::DeviceMemory<T>& b, - perftools::gputools::DeviceMemory<T>* c, - perftools::gputools::blas::ProfileResult* output_profile) { + static void Compute(OpKernelContext* ctx, perftools::gputools::Stream* stream, + bool trans, uint64 m, uint64 n, + const perftools::gputools::DeviceMemory<T>& a, + const perftools::gputools::DeviceMemory<T>& b, + perftools::gputools::DeviceMemory<T>* c) { const auto blas_trans = trans ? perftools::gputools::blas::Transpose::kTranspose : perftools::gputools::blas::Transpose::kNoTranspose; - if (output_profile == nullptr) { - bool blas_launch_status = - stream - ->ThenBlasGemv(blas_trans, m, n, static_cast<T>(1.0), a, m, b, 1, - static_cast<T>(0.0), c, 1) - .ok(); - if (!blas_launch_status) { - ctx->SetStatus( - errors::Internal("Blas GEMV launch failed: m=", m, ", n=", n)); - } - } else { - bool blas_launch_status = - stream - ->ThenBlasGemvWithProfiling(blas_trans, m, n, static_cast<T>(1.0), - a, m, b, 1, static_cast<T>(0.0), c, 1, - output_profile) - .ok(); - if (!blas_launch_status) { - ctx->SetStatus(errors::Internal( - "Blas GEMV with profiling launch failed: m=", m, ", n=", n)); - } + bool blas_launch_status = + stream + ->ThenBlasGemv(blas_trans, m, n, static_cast<T>(1.0), a, m, b, 1, + static_cast<T>(0.0), c, 1) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus( + errors::Internal("Blas GEMV launch failed: m=", m, ", n=", n)); } } @@ -201,8 +188,7 @@ void LaunchBlasGemv<Eigen::half>::Compute( OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans, uint64 m, uint64 n, const perftools::gputools::DeviceMemory<Eigen::half>& a, const perftools::gputools::DeviceMemory<Eigen::half>& b, - perftools::gputools::DeviceMemory<Eigen::half>* c, - perftools::gputools::blas::ProfileResult* output_profile) { + perftools::gputools::DeviceMemory<Eigen::half>* c) { ctx->SetStatus(errors::Internal( "Blas GEMV launch failed: GEMV is not implemented for float16.")); } @@ -214,55 +200,15 @@ bool LaunchBlasGemv<Eigen::half>::IsSupported() { } // namespace -bool GetCublasAutotuneComputationType( - const DataType& dtype, - perftools::gputools::blas::ComputationType* compute_type) { - using perftools::gputools::blas::ComputationType; - bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input(); - switch (dtype) { - case DT_HALF: - case DT_BFLOAT16: - if (use_f32_for_f16_computation) { - *compute_type = ComputationType::kF32; - } else { - *compute_type = ComputationType::kF16; - } - return false; - case DT_FLOAT: - *compute_type = ComputationType::kF32; - return true; - case DT_DOUBLE: - *compute_type = ComputationType::kF64; - return true; - default: - // Unsupported compute_type, return false. - return false; - } -} - -// A dummy type to group matmul autotune results together. -struct MatmulAutoTuneGroup { - static string name() { return "Matmul"; } -}; -typedef AutoTuneSingleton<MatmulAutoTuneGroup, MatmulParameters, - perftools::gputools::blas::AlgorithmConfig> - AutoTuneMatmul; - template <typename T> struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> { static void launch( - OpKernelContext* ctx, const Tensor& a, const Tensor& b, + OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b, const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, - std::vector<int64>* algorithms, bool use_autotune, Tensor* out) { - using perftools::gputools::blas::AlgorithmConfig; - using perftools::gputools::blas::ComputationType; - using perftools::gputools::blas::ProfileResult; - using perftools::gputools::blas::Transpose; - using perftools::gputools::blas::kDefaultAlgorithm; - using perftools::gputools::blas::kDefaultBlasGemm; - using perftools::gputools::blas::kDefaultBlasGemv; - using perftools::gputools::blas::kNoAlgorithm; - Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose}; + Tensor* out) { + perftools::gputools::blas::Transpose trans[] = { + perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose}; const uint64 m = a.dim_size(1 - dim_pair[0].first); const uint64 k = a.dim_size(dim_pair[0].first); const uint64 n = b.dim_size(1 - dim_pair[0].second); @@ -274,157 +220,36 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> { auto* stream = ctx->op_device_context()->stream(); OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); - auto a_ptr = AsDeviceMemory(a.template flat<T>().data(), - a.template flat<T>().size()); - auto b_ptr = AsDeviceMemory(b.template flat<T>().data(), - b.template flat<T>().size()); - auto c_ptr = AsDeviceMemory(out->template flat<T>().data(), - out->template flat<T>().size()); - auto alpha = static_cast<T>(1.0); - auto beta = static_cast<T>(0.0); - - int device_id = stream->parent()->device_ordinal(); - DataType dtype = a.dtype(); - MatmulParameters matmul_parameters = { - transpose_a, transpose_b, m, n, k, dtype, device_id, - }; - AlgorithmConfig algorithm_config(kNoAlgorithm); - - ComputationType computation_type; - bool compute_type_supported = - GetCublasAutotuneComputationType(dtype, &computation_type); - if (use_autotune && compute_type_supported && !algorithms->empty()) { - ProfileResult best_result; - // TODO(yangzihao): Unify this code with conv autotuning. - if (!AutoTuneMatmul::GetInstance()->Find(matmul_parameters, - &algorithm_config)) { - ProfileResult profile_result; - for (auto profile_algorithm : (*algorithms)) { - // Cublas does - // C = A x B - // where A, B and C are assumed to be in column major. - // We want the output to be in row-major, so we can compute - // C' = B' x A' (' stands for transpose) - bool cublas_launch_status = - stream - ->ThenBlasGemmWithAlgorithm( - blas_transpose_b, blas_transpose_a, n, m, k, alpha, b_ptr, - transpose_b ? k : n, a_ptr, transpose_a ? m : k, beta, - &c_ptr, n, computation_type, profile_algorithm, - &profile_result) - .ok(); - if (cublas_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - } - } - } - // Try BlasGemmWithProfiling - bool cublas_launch_status = - stream - ->ThenBlasGemmWithProfiling( - blas_transpose_b, blas_transpose_a, n, m, k, 1.0, b_ptr, - transpose_b ? k : n, a_ptr, transpose_a ? m : k, 0.0, - &c_ptr, n, &profile_result) - .ok(); - if (cublas_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - } - } - // Try BlasGemvWithProfiling - if (LaunchBlasGemv<T>::IsSupported() && n == 1) { - LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a, - transpose_a ? m : k, transpose_a ? k : m, - a_ptr, b_ptr, &c_ptr, &profile_result); - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - } - } - } - // We make sure that each matmul parameter set only gets one pass of - // autotune. If the best result is found, assign it to algorithm_type - // and insert it to autotune map. If all internal kernels of - // cublasGemmEx() returns invalid results, we add kNoAlgorithm to the - // autotune map. - if (best_result.is_valid()) { - algorithm_config.set_algorithm(best_result.algorithm()); - } - AutoTuneMatmul::GetInstance()->Insert(matmul_parameters, - algorithm_config); - if (algorithm_config.algorithm() != kNoAlgorithm && - algorithm_config.algorithm() != kDefaultBlasGemm && - algorithm_config.algorithm() != kDefaultBlasGemv) { - bool cublas_launch_status = - stream - ->ThenBlasGemmWithAlgorithm( - blas_transpose_b, blas_transpose_a, n, m, k, alpha, b_ptr, - transpose_b ? k : n, a_ptr, transpose_a ? m : k, beta, - &c_ptr, n, computation_type, algorithm_config.algorithm(), - nullptr) - .ok(); - if (!cublas_launch_status) { - ctx->SetStatus(errors::Internal( - "Blas GEMM with algorithm launch failed : a.shape=(", - a.dim_size(0), ", ", a.dim_size(1), "), b.shape=(", b.dim_size(0), - ", ", b.dim_size(1), "), m=", m, ", n=", n, ", k=", k)); - } - } - } - // For the following case, we use normal BlasGemm(): - // 1) We didn't set the use_autotune flag; - // 2) compute type does not support autotune; - // 3) no algorithm is found; - // 4) all internal kernels in autotune return invalid results. - if (!use_autotune || !compute_type_supported || algorithms->empty() || - algorithm_config.algorithm() == kNoAlgorithm || - algorithm_config.algorithm() == kDefaultBlasGemm || - algorithm_config.algorithm() == kDefaultBlasGemv) { - if (algorithm_config.algorithm() == kDefaultBlasGemv) { - // This is a matrix*vector multiply so use GEMV to compute A * b. - // Here we are multiplying in the natural order, so we have to flip - // the transposition flag to compensate for the tensor being stored - // row-major. - // TODO(yangzihao): Add Gemv as an autotuning option too. - LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a, - transpose_a ? m : k, transpose_a ? k : m, - a_ptr, b_ptr, &c_ptr, nullptr); - } else { - // Use C' = B' x A' (' stands for transpose) - bool blas_launch_status = - stream - ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, - 1.0f, b_ptr, transpose_b ? k : n, a_ptr, - transpose_a ? m : k, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - ctx->SetStatus(errors::Internal( - "Blas GEMM launch failed : a.shape=(", a.dim_size(0), ", ", - a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1), - "), m=", m, ", n=", n, ", k=", k)); - } + auto a_ptr = AsDeviceMemory(a.template flat<T>().data()); + auto b_ptr = AsDeviceMemory(b.template flat<T>().data()); + auto c_ptr = AsDeviceMemory(out->template flat<T>().data()); + // Cublas does + // C = A x B + // where A, B and C are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // C' = B' x A' (' stands for transpose) + if (LaunchBlasGemv<T>::IsSupported() && n == 1) { + // This is a matrix*vector multiply so use GEMV to compute A * b. + // Here we are multiplying in the natural order, so we have to flip + // the transposition flag to compensate for the tensor being stored + // row-major. + LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a, transpose_a ? m : k, + transpose_a ? k : m, a_ptr, b_ptr, &c_ptr); + } else { + bool blas_launch_status = + stream + ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f, + b_ptr, transpose_b ? k : n, a_ptr, + transpose_a ? m : k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal( + "Blas GEMM launch failed : a.shape=(", a.dim_size(0), ", ", + a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1), + "), m=", m, ", n=", n, ", k=", k)); } } } - - static void GetBlasGemmAlgorithm(OpKernelConstruction* ctx, - std::vector<int64>* algorithms, - bool* algorithm_set_flag) { - if (*algorithm_set_flag == false) { - auto* stream = ctx->device()->tensorflow_gpu_device_info()->stream; - stream->parent()->GetBlasGemmAlgorithms(algorithms); - *algorithm_set_flag = true; - } - } }; #endif // GOOGLE_CUDA @@ -432,14 +257,9 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> { template <typename Device, typename T, bool USE_CUBLAS> class MatMulOp : public OpKernel { public: - explicit MatMulOp(OpKernelConstruction* ctx) - : OpKernel(ctx), algorithms_set_already_(false) { + explicit MatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_)); - - LaunchMatMul<Device, T, USE_CUBLAS>::GetBlasGemmAlgorithm( - ctx, &algorithms_, &algorithms_set_already_); - use_autotune_ = MatmulAutotuneEnable(); } void Compute(OpKernelContext* ctx) override { @@ -482,14 +302,10 @@ class MatMulOp : public OpKernel { return; } - LaunchMatMul<Device, T, USE_CUBLAS>::launch( - ctx, a, b, dim_pair, &algorithms_, use_autotune_, out); + LaunchMatMul<Device, T, USE_CUBLAS>::launch(ctx, this, a, b, dim_pair, out); } private: - std::vector<int64> algorithms_; - bool algorithms_set_already_; - bool use_autotune_; bool transpose_a_; bool transpose_b_; }; diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h index 6398da2fb9..5a8db6da19 100644 --- a/tensorflow/core/kernels/matmul_op.h +++ b/tensorflow/core/kernels/matmul_op.h @@ -17,9 +17,7 @@ limitations under the License. #define TENSORFLOW_KERNELS_MATMUL_OP_H_ #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/lib/hash/hash.h" namespace tensorflow { namespace functor { @@ -52,68 +50,6 @@ struct MatMulFunctor { }; } // end namespace functor - -#if GOOGLE_CUDA -// Encapsulate all the shape information that is used in matmul operations. -class MatmulParameters { - public: - MatmulParameters(bool transa, bool transb, uint64 m, uint64 n, uint64 k, - DataType dtype, int device_id) - : transa_(transa), - transb_(transb), - m_(m), - n_(n), - k_(k), - dtype_(dtype), - device_id_(device_id) { - hash_code_ = transa; - hash_code_ = Hash64Combine(hash_code_, transb); - hash_code_ = Hash64Combine(hash_code_, m); - hash_code_ = Hash64Combine(hash_code_, n); - hash_code_ = Hash64Combine(hash_code_, k); - hash_code_ = Hash64Combine(hash_code_, dtype); - hash_code_ = Hash64Combine(hash_code_, device_id); - } - bool operator==(const MatmulParameters& other) const { - return this->get_data_as_tuple() == other.get_data_as_tuple(); - } - - bool operator!=(const MatmulParameters& other) const { - return !(*this == other); - } - uint64 hash() const { return hash_code_; } - - string ToString() const { - // clang-format off - return strings::StrCat( - transa_, ", ", transb_, ", ", - m_, ", ", n_, ", ", k_, - dtype_, ", ", device_id_); - // clang-format on - } - - private: - typedef std::tuple<bool, bool, int64, int64, int64, DataType, int> - ParameterDataType; - - ParameterDataType get_data_as_tuple() const { - return std::make_tuple(transa_, transb_, m_, n_, k_, dtype_, device_id_); - } - - bool transa_; - bool transb_; - uint64 m_; - uint64 n_; - uint64 k_; - DataType dtype_; - int device_id_; - uint64 hash_code_; -}; - -typedef Eigen::GpuDevice GPUDevice; - -#endif // GOOGLE_CUDA - } // end namespace tensorflow #endif // TENSORFLOW_KERNELS_MATMUL_OP_H_ diff --git a/tensorflow/core/util/matmul_autotune.cc b/tensorflow/core/util/matmul_autotune.cc deleted file mode 100644 index 741a78a193..0000000000 --- a/tensorflow/core/util/matmul_autotune.cc +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/util/matmul_autotune.h" - -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/util/env_var.h" - -namespace tensorflow { -bool MatmulAutotuneEnable() { - bool value; - Status status = - ReadBoolFromEnvVar("TF_MATMUL_AUTOTUNE_ENABLE", false, &value); - if (!status.ok()) { - LOG(ERROR) << status.error_message(); - } - return value; -} - -bool MatmulDoFP32ComputationFP16Input() { - bool value; - // Feedback from NVIDIA: the "true floating point 16" compute capability is - // absent from compute capability SM 5.2. The native 16 bit floating point - // computation was introduced in SM 5.3 and higher compute capability. So - // for compatibility, set this to be true by default for now. - // TODO(yangzihao): In the future, we need to return three possibilities: - // user-set-true, user-set-false, user-no-setting. In the calling sites, - // check the compatibilities. Note that user-set-false with compute - // capability <= 5.2 will cause an error in the later cublasGemmEx() call. - Status status = - ReadBoolFromEnvVar("TF_FP16_MATMUL_USE_FP32_COMPUTE", true, &value); - if (!status.ok()) { - LOG(ERROR) << status.error_message(); - } - return value; -} - -} // namespace tensorflow diff --git a/tensorflow/core/util/matmul_autotune.h b/tensorflow/core/util/matmul_autotune.h deleted file mode 100644 index 5366623883..0000000000 --- a/tensorflow/core/util/matmul_autotune.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// The utility to check matmul autotune related flags. - -#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_ - -namespace tensorflow { - -bool MatmulAutotuneEnable(); -bool MatmulDoFP32ComputationFP16Input(); - -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_ diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 29d351304c..7b7eb72994 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3851,48 +3851,6 @@ cuda_py_test( ) cuda_py_test( - name = "matmul_benchmark", - size = "medium", - srcs = ["ops/matmul_benchmark.py"], - additional_deps = [ - ":math_ops", - ":random_ops", - ":client", - ":client_testlib", - ":control_flow_ops", - ":framework_for_generated_wrappers", - ":framework_test_lib", - ":platform", - ":platform_benchmark", - ":variables", - "//third_party/py/numpy", - "//tensorflow/core:protos_all_py", - ], - main = "ops/matmul_benchmark.py", -) - -cuda_py_test( - name = "matmul_benchmark_test", - size = "medium", - srcs = ["ops/matmul_benchmark_test.py"], - additional_deps = [ - ":math_ops", - ":random_ops", - ":client", - ":client_testlib", - ":control_flow_ops", - ":framework_for_generated_wrappers", - ":platform", - ":platform_benchmark", - ":matmul_benchmark", - ":variables", - "//third_party/py/numpy", - "//tensorflow/core:protos_all_py", - ], - main = "ops/matmul_benchmark_test.py", -) - -cuda_py_test( name = "session_benchmark", srcs = ["client/session_benchmark.py"], additional_deps = [ diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py index b167278984..042f462357 100644 --- a/tensorflow/python/kernel_tests/matmul_op_test.py +++ b/tensorflow/python/kernel_tests/matmul_op_test.py @@ -31,9 +31,6 @@ from tensorflow.python.ops import random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test as test_lib -# TODO(yangzihao): Currently matmul autotuning is disabled by default. Use -# os.environ["TF_MATMUL_AUTOTUNE_ENABLE"] = "1" to enable it. - def _AddTest(test, op_name, testcase_name, fn): test_name = "_".join(["test", op_name, testcase_name]) diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py deleted file mode 100644 index 55c575162a..0000000000 --- a/tensorflow/python/ops/matmul_benchmark.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Benchmark for Matmul operator.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import itertools -import time - -import numpy as np - -from tensorflow.python.client import session as session_lib -from tensorflow.python.framework import ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variables -from tensorflow.python.platform import test - - -def build_graph(device, n, m, k, transpose_a, transpose_b, dtype): - """Build a graph containing a sequence of matmul operations. - - Args: - device: String, the device to run on. - n: tensor A's first dimension size. - m: tensor A's second dimension size. - k: tensor B's second dimension size. - transpose_a: boolean value to show if tensor A is transposed. - transpose_b: boolean value to show if tensor B is transposed. - dtype: numpy data type of the input tensor. - - Returns: - A matmul operation to run() - """ - with ops.device('/%s:0' % device): - if not transpose_a: - x = variables.Variable(random_ops.random_uniform([n, m], dtype=dtype)) - else: - x = variables.Variable(random_ops.random_uniform([m, n], dtype=dtype)) - if not transpose_b: - y = variables.Variable(random_ops.random_uniform([m, k], dtype=dtype)) - else: - y = variables.Variable(random_ops.random_uniform([k, m], dtype=dtype)) - - z = math_ops.matmul(x, y, transpose_a=transpose_a, transpose_b=transpose_b) - return control_flow_ops.group(z) - - -class MatmulBenchmark(test.Benchmark): - """Benchmark matmul!""" - - def run_graph(self, device, n, m, k, transpose_a, transpose_b, num_iters, - dtype): - """Run the graph and print its execution time. - - Args: - device: String, the device to run on. - n: tensor A's first dimension size. - m: tensor A's second dimension size. - k: tensor B's second dimension size. - transpose_a: boolean value to show if tensor A is transposed. - transpose_b: boolean value to show if tensor B is transposed. - num_iters: number of iterations to run the benchmark. - dtype: numpy data type of the input tensor. - - Returns: - The duration of the run in seconds. - """ - graph = ops.Graph() - with graph.as_default(): - output = build_graph(device, n, m, k, transpose_a, transpose_b, dtype) - with session_lib.Session(graph=graph) as session: - variables.global_variables_initializer().run() - for _ in range(500): - session.run(output) - start_time = time.time() - for _ in range(num_iters): - session.run(output) - duration = (time.time() - start_time) - num_items = n * m * k * 2 - throughput = num_items * num_iters / duration / 1e9 - print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' % - (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:' - + str(transpose_a) + '.tb:' + str(transpose_b), num_iters, - duration, throughput)) - - name_template = ('matmul_{device}_{dtype}_input_info_{inputinfo}') - - self.report_benchmark( - name=name_template.format( - device=device, - dtype=str(dtype).replace(' ', ''), - inputinfo=str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:' + - str(transpose_a) + '.tb:' + str(transpose_b)).replace(' ', ''), - iters=num_iters, - wall_time=duration) - return duration - - def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters): - self.run_graph('gpu', n, m, k, transpose_a, transpose_b, num_iters, dtype) - - def test_round(self, num_iters): - dtypes = [np.float32, np.float64] - for dtype in dtypes: - for n, m, (transpose_a, transpose_b) in itertools.product( - [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), - (False, True)]): - k = n - self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters) - - for n, m, k, (transpose_a, transpose_b) in itertools.product( - [200], [1, 8, 20], [10000], [(False, False), (True, False), (False, - True)]): - self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters) - - for (n, m, k), (transpose_a, transpose_b) in itertools.product( - [(200, 20, 20000), (1, 10000, 200)], [(False, False), (True, False), - (False, True)]): - self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters) - - def benchmark_matmul(self): - num_iters = 200 - for _ in range(10): - self.test_round(num_iters) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py deleted file mode 100644 index a7914dba78..0000000000 --- a/tensorflow/python/ops/matmul_benchmark_test.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for matmul_benchmark.py.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import itertools -import numpy as np - -from tensorflow.core.framework import graph_pb2 -from tensorflow.core.framework import node_def_pb2 -from tensorflow.python.framework import ops -from tensorflow.python.ops import matmul_benchmark -from tensorflow.python.platform import test as googletest -from tensorflow.python.platform import tf_logging - - -def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype): - - def Test(self): - if not googletest.is_gpu_available(): - tf_logging.info("Skipping BuildGraphTest %s", (n, m, k, transpose_a, - transpose_b)) - return - tf_logging.info("Testing BuildGraphTest %s", (n, m, k, transpose_a, - transpose_b)) - self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype) - - return Test - - -def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype): - - def Test(self): - if not googletest.is_gpu_available(): - tf_logging.info("Skipping RunGraphTest %s", (n, m, k, transpose_a, - transpose_b)) - return - tf_logging.info("Testing RunGraphTest %s", (n, m, k, transpose_a, - transpose_b)) - self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype) - - return Test - - -class MatmulBenchmarkTest(googletest.TestCase): - - def _StripNode(self, nd): - snode = node_def_pb2.NodeDef(name=nd.name, op=nd.op, input=nd.input) - if nd.device: - snode.device = nd.device - return snode - - def _StripGraph(self, gd): - return graph_pb2.GraphDef(node=[self._StripNode(nd) for nd in gd.node]) - - def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype): - graph = ops.Graph() - with graph.as_default(): - matmul_benchmark.build_graph("gpu", n, m, k, transpose_a, transpose_b, - dtype) - gd = graph.as_graph_def() - self.assertProtoEquals(""" - node { name: "random_uniform/shape" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform/min" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform/max" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: "/device:GPU:0" } - node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: "/device:GPU:0" } - node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: "/device:GPU:0" } - node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: "/device:GPU:0" } - node { name: "Variable" op: "VariableV2" device: "/device:GPU:0" } - node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: "/device:GPU:0" } - node { name: "Variable/read" op: "Identity" input: "Variable" device: "/device:GPU:0" } - node { name: "random_uniform_1/shape" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform_1/min" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform_1/max" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: "/device:GPU:0" } - node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: "/device:GPU:0" } - node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: "/device:GPU:0" } - node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: "/device:GPU:0" } - node { name: "Variable_1" op: "VariableV2" device: "/device:GPU:0" } - node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: "/device:GPU:0" } - node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: "/device:GPU:0" } - node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: "/device:GPU:0" } - node { name: "group_deps" op: "NoOp" input: "^MatMul" device: "/device:GPU:0" } - """, self._StripGraph(gd)) - - def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype): - benchmark_instance = matmul_benchmark.MatmulBenchmark() - duration = benchmark_instance.run_graph("gpu", n, m, k, transpose_a, - transpose_b, 1, dtype) - self.assertTrue(duration > 1e-6) - - -if __name__ == "__main__": - dtypes = [np.float32, np.float64] - index = 0 - for _dtype in dtypes: - for _n, _m, (_transpose_a, _transpose_b) in itertools.product( - [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), (False, - True)]): - _k = _n - setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index), - BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype)) - setattr(MatmulBenchmarkTest, "testRunGraph_" + str(index), - RunGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype)) - index += 1 - googletest.main() diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc index da09d84921..a59a1dda71 100644 --- a/tensorflow/stream_executor/blas.cc +++ b/tensorflow/stream_executor/blas.cc @@ -67,10 +67,6 @@ string SideString(Side s) { } } -// -- AlgorithmConfig - -string AlgorithmConfig::ToString() const { return port::StrCat(algorithm_); } - string ComputationTypeString(ComputationType ty) { switch (ty) { case ComputationType::kF16: diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index eb1b19c5d9..cfff3649c8 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -44,6 +44,7 @@ limitations under the License. #include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/platform/port.h" namespace Eigen { struct half; @@ -107,10 +108,6 @@ string ComputationTypeString(ComputationType ty); // Opaque identifier for an "algorithm" used by a blas routine. This functions // as a hint to the blas library. typedef int64 AlgorithmType; -constexpr AlgorithmType kDefaultAlgorithm = -1; -constexpr AlgorithmType kDefaultBlasGemm = -2; -constexpr AlgorithmType kDefaultBlasGemv = -3; -constexpr AlgorithmType kNoAlgorithm = -4; // blas uses -1 to represent the default algorithm. This happens to match up // with the CUBLAS_GEMM_DFALT constant, so cuda_blas.cc is using static_cast @@ -137,28 +134,10 @@ class ProfileResult { private: bool is_valid_ = false; - AlgorithmType algorithm_ = kDefaultAlgorithm; + AlgorithmType algorithm_ = 0; float elapsed_time_in_ms_ = std::numeric_limits<float>::max(); }; -class AlgorithmConfig { - public: - AlgorithmConfig() : algorithm_(kDefaultAlgorithm) {} - explicit AlgorithmConfig(AlgorithmType algorithm) : algorithm_(algorithm) {} - AlgorithmType algorithm() const { return algorithm_; } - void set_algorithm(AlgorithmType val) { algorithm_ = val; } - bool operator==(const AlgorithmConfig &other) const { - return this->algorithm_ == other.algorithm_; - } - bool operator!=(const AlgorithmConfig &other) const { - return !(*this == other); - } - string ToString() const; - - private: - AlgorithmType algorithm_; -}; - // BLAS support interface -- this can be derived from a GPU executor when the // underlying platform has an BLAS library implementation available. See // StreamExecutor::AsBlas(). @@ -474,29 +453,6 @@ class BlasSupport { std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy) = 0; - virtual bool DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha, - const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x, - int incx, float beta, DeviceMemory<float> *y, int incy, - ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha, - const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x, - int incx, double beta, DeviceMemory<double> *y, int incy, - ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, - std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a, - int lda, const DeviceMemory<std::complex<float>> &x, int incx, - std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy, - ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, - std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a, - int lda, const DeviceMemory<std::complex<double>> &x, int incx, - std::complex<double> beta, DeviceMemory<std::complex<double>> *y, - int incy, ProfileResult *output_profile_result) = 0; - // Performs a rank-1 update of a general matrix. // // a <- alpha * x * y' + a, @@ -979,39 +935,8 @@ class BlasSupport { std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc) = 0; - virtual bool DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a, - int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta, - DeviceMemory<Eigen::half> *c, int ldc, - ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda, - const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c, - int ldc, ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda, - const DeviceMemory<double> &b, int ldb, double beta, - DeviceMemory<double> *c, int ldc, - ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex<float> alpha, - const DeviceMemory<std::complex<float>> &a, int lda, - const DeviceMemory<std::complex<float>> &b, int ldb, - std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, - ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex<double> alpha, - const DeviceMemory<std::complex<double>> &a, int lda, - const DeviceMemory<std::complex<double>> &b, int ldb, - std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc, - ProfileResult *output_profile_result) = 0; - - // Gets a list of supported algorithms for DoBlasGemmWithAlgorithm. + // Gets a list of supported algorithms for DoBlasGemmWithAlgorithm. Note that + // any or all of these algorithms may still be virtual bool GetBlasGemmAlgorithms( std::vector<AlgorithmType> *out_algorithms) = 0; @@ -1548,28 +1473,6 @@ class BlasSupport { const DeviceMemory<std::complex<double>> &x, int incx, \ std::complex<double> beta, \ DeviceMemory<std::complex<double>> *y, int incy) override; \ - bool DoBlasGemvWithProfiling( \ - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha, \ - const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x, \ - int incx, float beta, DeviceMemory<float> *y, int incy, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemvWithProfiling( \ - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha, \ - const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x, \ - int incx, double beta, DeviceMemory<double> *y, int incy, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemvWithProfiling( \ - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ - std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a, \ - int lda, const DeviceMemory<std::complex<float>> &x, int incx, \ - std::complex<float> beta, DeviceMemory<std::complex<float>> *y, \ - int incy, blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemvWithProfiling( \ - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, \ - std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a, \ - int lda, const DeviceMemory<std::complex<double>> &x, int incx, \ - std::complex<double> beta, DeviceMemory<std::complex<double>> *y, \ - int incy, blas::ProfileResult *output_profile_result) override; \ bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha, \ const DeviceMemory<float> &x, int incx, \ const DeviceMemory<float> &y, int incy, \ @@ -1848,39 +1751,6 @@ class BlasSupport { const DeviceMemory<std::complex<double>> &b, int ldb, \ std::complex<double> beta, \ DeviceMemory<std::complex<double>> *c, int ldc) override; \ - bool DoBlasGemmWithProfiling( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, float alpha, \ - const DeviceMemory<Eigen::half> &a, int lda, \ - const DeviceMemory<Eigen::half> &b, int ldb, float beta, \ - DeviceMemory<Eigen::half> *c, int ldc, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemmWithProfiling( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \ - int lda, const DeviceMemory<float> &b, int ldb, float beta, \ - DeviceMemory<float> *c, int ldc, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemmWithProfiling( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, double alpha, \ - const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b, \ - int ldb, double beta, DeviceMemory<double> *c, int ldc, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemmWithProfiling( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \ - const DeviceMemory<std::complex<float>> &a, int lda, \ - const DeviceMemory<std::complex<float>> &b, int ldb, \ - std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemmWithProfiling( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \ - const DeviceMemory<std::complex<double>> &a, int lda, \ - const DeviceMemory<std::complex<double>> &b, int ldb, \ - std::complex<double> beta, DeviceMemory<std::complex<double>> *c, \ - int ldc, blas::ProfileResult *output_profile_result) override; \ bool GetBlasGemmAlgorithms(std::vector<blas::AlgorithmType> *out_algorithms) \ override; \ bool DoBlasGemmWithAlgorithm( \ diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 9070766166..2817364e97 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -1857,180 +1857,6 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa, CUDAComplex(CUDAMemoryMutable(c)), ldc); } -bool CUDABlas::DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha, - const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x, - int incx, float beta, DeviceMemory<float> *y, int incy, - blas::ProfileResult *output_profile_result) { - return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x, - incx, beta, y, incy, - output_profile_result); -} - -bool CUDABlas::DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha, - const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x, - int incx, double beta, DeviceMemory<double> *y, int incy, - blas::ProfileResult *output_profile_result) { - return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x, - incx, beta, y, incy, - output_profile_result); -} - -bool CUDABlas::DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, - std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a, - int lda, const DeviceMemory<std::complex<float>> &x, int incx, - std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy, - blas::ProfileResult *output_profile_result) { - return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x, - incx, beta, y, incy, - output_profile_result); -} - -bool CUDABlas::DoBlasGemvWithProfiling( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, - std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a, - int lda, const DeviceMemory<std::complex<double>> &x, int incx, - std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy, - blas::ProfileResult *output_profile_result) { - return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x, - incx, beta, y, incy, - output_profile_result); -} - -bool CUDABlas::DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a, - int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta, - DeviceMemory<Eigen::half> *c, int ldc, - blas::ProfileResult *output_profile_result) { - return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -bool CUDABlas::DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda, - const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c, - int ldc, blas::ProfileResult *output_profile_result) { - return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -bool CUDABlas::DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda, - const DeviceMemory<double> &b, int ldb, double beta, - DeviceMemory<double> *c, int ldc, - blas::ProfileResult *output_profile_result) { - return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -bool CUDABlas::DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex<float> alpha, - const DeviceMemory<std::complex<float>> &a, int lda, - const DeviceMemory<std::complex<float>> &b, int ldb, - std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, - blas::ProfileResult *output_profile_result) { - return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -bool CUDABlas::DoBlasGemmWithProfiling( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex<double> alpha, - const DeviceMemory<std::complex<double>> &a, int lda, - const DeviceMemory<std::complex<double>> &b, int ldb, - std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc, - blas::ProfileResult *output_profile_result) { - return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -template <typename T> -bool CUDABlas::DoBlasGemvWithProfilingImpl( - Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha, - const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx, - const T &beta, DeviceMemory<T> *y, int incy, - blas::ProfileResult *output_profile_result) { - struct TimerDeleter { - void operator()(CUDATimer *t) { - t->Destroy(); - delete t; - } - }; - std::unique_ptr<CUDATimer, TimerDeleter> timer; - if (output_profile_result != nullptr) { - timer.reset(new CUDATimer(parent_)); - if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) { - return false; - } - } - - // Call blasGemm - bool result = - DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); - - if (timer != nullptr && result) { - // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error - // state. - if (!timer->Stop(AsCUDAStream(stream))) { - return false; - } - output_profile_result->set_is_valid(true); - output_profile_result->set_algorithm(blas::kDefaultBlasGemv); - output_profile_result->set_elapsed_time_in_ms( - timer->GetElapsedMilliseconds()); - } - return result; -} - -template <typename T, typename ParamType> -bool CUDABlas::DoBlasGemmWithProfilingImpl( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a, - int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta, - DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) { - struct TimerDeleter { - void operator()(CUDATimer *t) { - t->Destroy(); - delete t; - } - }; - std::unique_ptr<CUDATimer, TimerDeleter> timer; - if (output_profile_result != nullptr) { - timer.reset(new CUDATimer(parent_)); - if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) { - return false; - } - } - - // Call blasGemm - bool result = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a, lda, b, - ldb, beta, c, ldc); - - if (timer != nullptr && result) { - // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error - // state. - if (!timer->Stop(AsCUDAStream(stream))) { - return false; - } - output_profile_result->set_is_valid(true); - output_profile_result->set_algorithm(blas::kDefaultBlasGemm); - output_profile_result->set_elapsed_time_in_ms( - timer->GetElapsedMilliseconds()); - } - return result; -} - template <typename InT, typename OutT, typename CompT> bool CUDABlas::DoBlasGemmWithAlgorithmImpl( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, @@ -2094,9 +1920,6 @@ bool CUDABlas::GetBlasGemmAlgorithms( std::vector<blas::AlgorithmType> *out_algorithms) { // cublasGemmAlgo_t (and the function that accepts this type, cublasGemmEx) // were first introduced in CUDA 8. -// Note that when CUDA version and compute capability is not sufficient, we -// still return the out_algorithms. Caller needs to make sure that in this case, -// the returned vector is empty. #if CUDA_VERSION >= 8000 for (cublasGemmAlgo_t algo : {CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, @@ -2104,10 +1927,8 @@ bool CUDABlas::GetBlasGemmAlgorithms( CUBLAS_GEMM_ALGO5, CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7}) { out_algorithms->push_back(algo); } - return true; -#else - return false; #endif + return true; } bool CUDABlas::DoBlasGemmWithAlgorithm( diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 80cda97117..4a8641b300 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -127,23 +127,6 @@ class CUDABlas : public blas::BlasSupport { blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result); - // Helper function for implementing DoBlasGemmWithProfiling. - template <typename T, typename ParamType> - bool DoBlasGemmWithProfilingImpl( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a, - int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta, - DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result); - - // Helper function for implementing DoBlasGemvWithProfiling. - template <typename T> - bool DoBlasGemvWithProfilingImpl(Stream *stream, blas::Transpose trans, - uint64 m, uint64 n, const T &alpha, - const DeviceMemory<T> &a, int lda, - const DeviceMemory<T> &x, int incx, - const T &beta, DeviceMemory<T> *y, int incy, - blas::ProfileResult *output_profile_result); - // mutex that guards the cuBLAS handle for this device. mutex mu_; diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index c9b36ba7ab..5996195173 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -3458,184 +3458,6 @@ struct ThenBlasWithProfileImpl { }; } // anonymous namespace -Stream &Stream::ThenBlasGemvWithProfiling( - blas::Transpose trans, uint64 m, uint64 n, float alpha, - const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x, - int incx, float beta, DeviceMemory<float> *y, int incy, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), - PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), - PARAM(incy)); - - ThenBlasWithProfileImpl< - blas::Transpose, uint64, uint64, float, const DeviceMemory<float> &, int, - const DeviceMemory<float> &, int, float, DeviceMemory<float> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n, - alpha, a, lda, x, incx, beta, y, incy, output_profile_result); -} - -Stream &Stream::ThenBlasGemvWithProfiling( - blas::Transpose trans, uint64 m, uint64 n, double alpha, - const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x, - int incx, double beta, DeviceMemory<double> *y, int incy, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), - PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), - PARAM(incy)); - - ThenBlasWithProfileImpl<blas::Transpose, uint64, uint64, double, - const DeviceMemory<double> &, int, - const DeviceMemory<double> &, int, double, - DeviceMemory<double> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n, - alpha, a, lda, x, incx, beta, y, incy, output_profile_result); -} - -Stream &Stream::ThenBlasGemvWithProfiling( - blas::Transpose trans, uint64 m, uint64 n, std::complex<float> alpha, - const DeviceMemory<std::complex<float>> &a, int lda, - const DeviceMemory<std::complex<float>> &x, int incx, - std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), - PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), - PARAM(incy)); - - ThenBlasWithProfileImpl<blas::Transpose, uint64, uint64, std::complex<float>, - const DeviceMemory<std::complex<float>> &, int, - const DeviceMemory<std::complex<float>> &, int, - std::complex<float>, - DeviceMemory<std::complex<float>> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n, - alpha, a, lda, x, incx, beta, y, incy, output_profile_result); -} - -Stream &Stream::ThenBlasGemvWithProfiling( - blas::Transpose trans, uint64 m, uint64 n, std::complex<double> alpha, - const DeviceMemory<std::complex<double>> &a, int lda, - const DeviceMemory<std::complex<double>> &x, int incx, - std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a), - PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y), - PARAM(incy)); - - ThenBlasWithProfileImpl<blas::Transpose, uint64, uint64, std::complex<double>, - const DeviceMemory<std::complex<double>> &, int, - const DeviceMemory<std::complex<double>> &, int, - std::complex<double>, - DeviceMemory<std::complex<double>> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n, - alpha, a, lda, x, incx, beta, y, incy, output_profile_result); -} - -Stream &Stream::ThenBlasGemmWithProfiling( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, float alpha, const DeviceMemory<Eigen::half> &a, int lda, - const DeviceMemory<Eigen::half> &b, int ldb, float beta, - DeviceMemory<Eigen::half> *c, int ldc, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), - PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), - PARAM(beta), PARAM(c), PARAM(ldc)); - - ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64, - uint64, float, const DeviceMemory<Eigen::half> &, int, - const DeviceMemory<Eigen::half> &, int, float, - DeviceMemory<Eigen::half> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb, - m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -Stream &Stream::ThenBlasGemmWithProfiling( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, float alpha, const DeviceMemory<float> &a, int lda, - const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c, - int ldc, blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), - PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), - PARAM(beta), PARAM(c), PARAM(ldc)); - - ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64, - uint64, float, const DeviceMemory<float> &, int, - const DeviceMemory<float> &, int, float, - DeviceMemory<float> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb, - m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -Stream &Stream::ThenBlasGemmWithProfiling( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, double alpha, const DeviceMemory<double> &a, int lda, - const DeviceMemory<double> &b, int ldb, double beta, - DeviceMemory<double> *c, int ldc, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), - PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), - PARAM(beta), PARAM(c), PARAM(ldc)); - - ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64, - uint64, double, const DeviceMemory<double> &, int, - const DeviceMemory<double> &, int, double, - DeviceMemory<double> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb, - m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -Stream &Stream::ThenBlasGemmWithProfiling( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex<float> alpha, - const DeviceMemory<std::complex<float>> &a, int lda, - const DeviceMemory<std::complex<float>> &b, int ldb, - std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), - PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), - PARAM(beta), PARAM(c), PARAM(ldc)); - - ThenBlasWithProfileImpl< - blas::Transpose, blas::Transpose, uint64, uint64, uint64, - std::complex<float>, const DeviceMemory<std::complex<float>> &, int, - const DeviceMemory<std::complex<float>> &, int, std::complex<float>, - DeviceMemory<std::complex<float>> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb, - m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - output_profile_result); -} - -Stream &Stream::ThenBlasGemmWithProfiling( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex<double> alpha, - const DeviceMemory<std::complex<double>> &a, int lda, - const DeviceMemory<std::complex<double>> &b, int ldb, - std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc, - blas::ProfileResult *output_profile_result) { - VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), - PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), - PARAM(beta), PARAM(c), PARAM(ldc)); - - ThenBlasWithProfileImpl< - blas::Transpose, blas::Transpose, uint64, uint64, uint64, - std::complex<double>, const DeviceMemory<std::complex<double>> &, int, - const DeviceMemory<std::complex<double>> &, int, std::complex<double>, - DeviceMemory<std::complex<double>> *, int> - impl; - return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb, - m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - output_profile_result); -} - Stream &Stream::ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, uint64 k, const Eigen::half &alpha, const DeviceMemory<Eigen::half> &a, diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index e218873839..3c8b7ee894 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -934,31 +934,6 @@ class Stream { std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy); - Stream &ThenBlasGemvWithProfiling(blas::Transpose trans, uint64 m, uint64 n, - float alpha, const DeviceMemory<float> &a, - int lda, const DeviceMemory<float> &x, - int incx, float beta, - DeviceMemory<float> *y, int incy, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemvWithProfiling(blas::Transpose trans, uint64 m, uint64 n, - double alpha, const DeviceMemory<double> &a, - int lda, const DeviceMemory<double> &x, - int incx, double beta, - DeviceMemory<double> *y, int incy, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemvWithProfiling( - blas::Transpose trans, uint64 m, uint64 n, std::complex<float> alpha, - const DeviceMemory<std::complex<float>> &a, int lda, - const DeviceMemory<std::complex<float>> &x, int incx, - std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemvWithProfiling( - blas::Transpose trans, uint64 m, uint64 n, std::complex<double> alpha, - const DeviceMemory<std::complex<double>> &a, int lda, - const DeviceMemory<std::complex<double>> &x, int incx, - std::complex<double> beta, DeviceMemory<std::complex<double>> *y, - int incy, blas::ProfileResult *output_profile_result); - // See BlasSupport::DoBlasGer. Stream &ThenBlasGer(uint64 m, uint64 n, float alpha, const DeviceMemory<float> &x, int incx, @@ -1274,44 +1249,6 @@ class Stream { std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc); - Stream &ThenBlasGemmWithProfiling(blas::Transpose transa, - blas::Transpose transb, uint64 m, uint64 n, - uint64 k, float alpha, - const DeviceMemory<Eigen::half> &a, int lda, - const DeviceMemory<Eigen::half> &b, int ldb, - float beta, DeviceMemory<Eigen::half> *c, - int ldc, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemmWithProfiling(blas::Transpose transa, - blas::Transpose transb, uint64 m, uint64 n, - uint64 k, float alpha, - const DeviceMemory<float> &a, int lda, - const DeviceMemory<float> &b, int ldb, - float beta, DeviceMemory<float> *c, int ldc, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemmWithProfiling(blas::Transpose transa, - blas::Transpose transb, uint64 m, uint64 n, - uint64 k, double alpha, - const DeviceMemory<double> &a, int lda, - const DeviceMemory<double> &b, int ldb, - double beta, DeviceMemory<double> *c, - int ldc, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemmWithProfiling( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex<float> alpha, - const DeviceMemory<std::complex<float>> &a, int lda, - const DeviceMemory<std::complex<float>> &b, int ldb, - std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemmWithProfiling( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex<double> alpha, - const DeviceMemory<std::complex<double>> &a, int lda, - const DeviceMemory<std::complex<double>> &b, int ldb, - std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc, - blas::ProfileResult *output_profile_result); - // See BlasSupport::DoBlasGemmWithAlgorithm. Stream &ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, |