author | A. Unique TensorFlower <gardener@tensorflow.org> | 2016-09-30 12:32:28 -0800
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-09-30 13:49:10 -0700
commit | 3057e97bf5d8079f6e139212c6812fd07e989390 (patch)
tree | 8ce6dd55c63543592426f8477737ee314f3d7382 /tensorflow/core/kernels/batch_matmul_op.cc
parent | 113093b017c8b2654c052a054e63738174bae649 (diff)
Shard batch_matmul_op across files to speed up build.
Change: 134820471
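The deleted translation unit below compiled every BatchMatMul kernel variant (all CPU and GPU scalar types, including the large Eigen contraction instantiations) in one file, which dominated build time. The usual way to shard such a file, sketched below, is to move the templated kernels into a shared header and let each new .cc file instantiate and register only a subset of the types, so the shards compile as independent, parallel translation units. The file names and the REGISTER_BATCH_MATMUL_CPU macro in this sketch are illustrative assumptions only; this commit shows just the deletion, not the follow-up files.

// batch_matmul_op_impl.h (assumed name): holds the templated BatchMatMul
// kernel and launch helpers formerly defined in batch_matmul_op.cc, plus a
// registration macro, but performs no registrations of its own.
#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                   \
  REGISTER_KERNEL_BUILDER(                                                \
      Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),   \
      BatchMatMul<CPUDevice, TYPE>)

// batch_matmul_op_real.cc (assumed name): instantiates only the real-valued
// kernels, so this shard carries the float/double contractions.
#include "tensorflow/core/kernels/batch_matmul_op_impl.h"
namespace tensorflow {
TF_CALL_float(REGISTER_BATCH_MATMUL_CPU);
TF_CALL_double(REGISTER_BATCH_MATMUL_CPU);
}  // namespace tensorflow

// batch_matmul_op_complex.cc (assumed name): instantiates only the complex
// kernels, typically the slowest template instantiations to compile.
#include "tensorflow/core/kernels/batch_matmul_op_impl.h"
namespace tensorflow {
TF_CALL_complex64(REGISTER_BATCH_MATMUL_CPU);
TF_CALL_complex128(REGISTER_BATCH_MATMUL_CPU);
}  // namespace tensorflow

Splitting along the real/complex boundary is the natural cut here, since the deleted file itself partially specializes ParallelMatMulKernel to avoid the expensive complex instantiations for real types; the op's behavior is unchanged by the sharding.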
Diffstat (limited to 'tensorflow/core/kernels/batch_matmul_op.cc')
-rw-r--r-- | tensorflow/core/kernels/batch_matmul_op.cc | 465 |
1 file changed, 0 insertions, 465 deletions
diff --git a/tensorflow/core/kernels/batch_matmul_op.cc b/tensorflow/core/kernels/batch_matmul_op.cc
deleted file mode 100644
index f8c12927e3..0000000000
--- a/tensorflow/core/kernels/batch_matmul_op.cc
+++ /dev/null
@@ -1,465 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/math_ops.cc.
-
-#define EIGEN_USE_THREADS
-
-#include <vector>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/type_traits.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/fill_functor.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/work_sharder.h"
-
-#if GOOGLE_CUDA
-#include "tensorflow/core/platform/stream_executor.h"
-#endif  // GOOGLE_CUDA
-
-namespace tensorflow {
-
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
-
-namespace {
-
-Eigen::IndexPair<Eigen::DenseIndex> ContractionDims(bool adj_x, bool adj_y) {
-  if (!adj_x) {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 1);
-    }
-  } else {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 1);
-    }
-  }
-}
-
-// Parallel batch matmul kernel based on the multi-threaded tensor contraction
-// in Eigen.
-template <typename Scalar, bool IsComplex = true>
-struct ParallelMatMulKernel {
-  static void Conjugate(const OpKernelContext* context, Tensor* out) {
-    const Eigen::ThreadPoolDevice d = context->eigen_cpu_device();
-    auto z = out->tensor<Scalar, 3>();
-    z.device(d) = z.conjugate();
-  }
-
-  static void Run(const OpKernelContext* context, const Tensor& in_x,
-                  const Tensor in_y, bool adj_x, bool adj_y, Tensor* out,
-                  int start, int limit) {
-    static_assert(IsComplex, "Complex type expected.");
-    auto Tx = in_x.tensor<Scalar, 3>();
-    auto Ty = in_y.tensor<Scalar, 3>();
-    auto Tz = out->tensor<Scalar, 3>();
-    // We use the identities
-    //   conj(a) * conj(b) = conj(a * b)
-    //   conj(a) * b = conj(a * conj(b))
-    // to halve the number of cases. The final conjugation of the result is
-    // done at the end of LaunchBatchMatMul<CPUDevice, Scalar>::Launch().
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
-    contract_pairs[0] = ContractionDims(adj_x, adj_y);
-    const Eigen::ThreadPoolDevice d = context->eigen_cpu_device();
-    for (int i = start; i < limit; ++i) {
-      auto x = Tx.template chip<0>(i);
-      auto z = Tz.template chip<0>(i);
-      if (adj_x != adj_y) {
-        auto y = Ty.template chip<0>(i).conjugate();
-        z.device(d) = x.contract(y, contract_pairs);
-      } else {
-        auto y = Ty.template chip<0>(i);
-        z.device(d) = x.contract(y, contract_pairs);
-      }
-    }
-  }
-};
-
-// The Eigen contraction kernel used here is very large and slow to compile,
-// so we partially specialize ParallelMatMulKernel for real types to avoid all
-// but one of the instantiations.
-template <typename Scalar>
-struct ParallelMatMulKernel<Scalar, false> {
-  static void Conjugate(const OpKernelContext* context, Tensor* out) {}
-
-  static void Run(const OpKernelContext* context, const Tensor& in_x,
-                  const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out,
-                  int start, int limit) {
-    auto Tx = in_x.tensor<Scalar, 3>();
-    auto Ty = in_y.tensor<Scalar, 3>();
-    auto Tz = out->tensor<Scalar, 3>();
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
-    contract_pairs[0] = ContractionDims(adj_x, adj_y);
-    const Eigen::ThreadPoolDevice d = context->eigen_cpu_device();
-    for (int i = start; i < limit; ++i) {
-      auto x = Tx.template chip<0>(i);
-      auto y = Ty.template chip<0>(i);
-      auto z = Tz.template chip<0>(i);
-      z.device(d) = x.contract(y, contract_pairs);
-    }
-  }
-};
-
-// TODO(rmlarsen): Get rid of this when we have upstreamed improvements
-// for matrix*vector and vector*matrix to Eigen's general matrix product.
-template <typename Tx, typename Ty, typename Tz>
-static void Multiply(bool adj_x, bool adj_y, Tx x, Ty y, Tz z) {
-  if (!adj_x) {
-    if (!adj_y) {
-      z.noalias() = x * y;
-    } else {
-      z.noalias() = x * y.adjoint();
-    }
-  } else {
-    if (!adj_y) {
-      z.noalias() = x.adjoint() * y;
-    } else {
-      z.noalias() = x.adjoint() * y.adjoint();
-    }
-  }
-}
-
-// Sequential batch matmul kernel that calls the regular Eigen matmul.
-// We prefer this over the tensor contraction because it performs
-// better on vector-matrix and matrix-vector products.
-template <typename Scalar>
-struct SequentialMatMulKernel {
-  using Matrix =
-      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  using ConstMatrixMap = Eigen::Map<const Matrix>;
-  using MatrixMap = Eigen::Map<Matrix>;
-
-  static ConstMatrixMap ConstTensorSliceToEigenMatrix(const Tensor& t,
-                                                      int slice) {
-    return ConstMatrixMap(
-        t.flat<Scalar>().data() + slice * t.dim_size(1) * t.dim_size(2),
-        t.dim_size(1), t.dim_size(2));
-  }
-
-  static MatrixMap TensorSliceToEigenMatrix(Tensor* t, int slice) {
-    return MatrixMap(
-        t->flat<Scalar>().data() + slice * t->dim_size(1) * t->dim_size(2),
-        t->dim_size(1), t->dim_size(2));
-  }
-
-  static void Run(const Tensor& in_x, const Tensor& in_y, bool adj_x,
-                  bool adj_y, Tensor* out, int start, int limit) {
-    for (int i = start; i < limit; ++i) {
-      auto x = ConstTensorSliceToEigenMatrix(in_x, i);
-      auto y = ConstTensorSliceToEigenMatrix(in_y, i);
-      auto z = TensorSliceToEigenMatrix(out, i);
-      // TODO(rmlarsen): Get rid of the special casing here when we have
-      // upstreamed improvements for matrix*vector and vector*matrix to
-      // Eigen's general matrix product.
-      if (!adj_x && x.rows() == 1) {
-        Multiply(adj_x, adj_y, x.row(0), y, z);
-      } else if (adj_x && x.cols() == 1) {
-        Multiply(adj_x, adj_y, x.col(0), y, z);
-      } else if (!adj_y && y.cols() == 1) {
-        Multiply(adj_x, adj_y, x, y.col(0), z);
-      } else if (adj_y && y.rows() == 1) {
-        Multiply(adj_x, adj_y, x, y.row(0), z);
-      } else {
-        Multiply(adj_x, adj_y, x, y, z);
-      }
-    }
-  }
-};
-
-}  // namespace
-
-template <typename Device, typename Scalar>
-struct LaunchBatchMatMul;
-
-template <typename Scalar>
-struct LaunchBatchMatMul<CPUDevice, Scalar> {
-  static void Launch(OpKernelContext* context, const Tensor& in_x,
-                     const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-    typedef ParallelMatMulKernel<Scalar, Eigen::NumTraits<Scalar>::IsComplex>
-        ParallelMatMulKernel;
-    bool conjugate_result = false;
-
-    // Number of matrix multiplies i.e. size of the batch.
-    const int64 num_units = in_x.dim_size(0);
-    const int64 cost_per_unit =
-        in_x.dim_size(1) * in_x.dim_size(2) * out->dim_size(2);
-    const int64 min_dim = std::min(std::min(in_x.dim_size(1), in_x.dim_size(2)),
-                                   out->dim_size(2));
-    const int64 kMaxCostOuterParallelism = 128 * 256 * 256;  // heuristic.
-    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    if (min_dim > 1 &&
-        (num_units == 1 || cost_per_unit > kMaxCostOuterParallelism)) {
-      // Parallelize over inner dims.
-      // For large matrix products it is counter-productive to parallelize
-      // over the batch dimension.
-      ParallelMatMulKernel::Run(context, in_x, in_y, adj_x, adj_y, out, 0,
-                                num_units);
-      conjugate_result = adj_x;
-    } else if (min_dim > 1 && worker_threads.num_threads > num_units) {
-      // Parallelize over both outer and inner dims.
-      // TODO(rmlarsen): The parallelized contraction in Eigen can deadlock
-      // when running num_threads or more contractions in parallel. Launch on
-      // all worker_threads.num_threads threads here once that is fixed.
-      Shard(std::max(1, worker_threads.num_threads - 1), worker_threads.workers,
-            num_units, cost_per_unit,
-            [context, &in_x, &in_y, adj_x, adj_y, out](int start, int limit) {
-              ParallelMatMulKernel::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                        start, limit);
-            });
-      conjugate_result = adj_x;
-    } else {
-      // Parallelize over outer dims. For small matrices and large batches, it
-      // is counter-productive to parallelize the inner matrix multiplies.
-      Shard(worker_threads.num_threads, worker_threads.workers, num_units,
-            cost_per_unit,
-            [&in_x, &in_y, adj_x, adj_y, out](int start, int limit) {
-              SequentialMatMulKernel<Scalar>::Run(in_x, in_y, adj_x, adj_y, out,
-                                                  start, limit);
-            });
-    }
-    if (conjugate_result) {
-      // We used one of the identities
-      //   conj(a) * conj(b) = conj(a * b)
-      //   conj(a) * b = conj(a * conj(b))
-      // above, we need to conjugate the final output. This is a
-      // no-op for non-complex types.
-      ParallelMatMulKernel::Conjugate(context, out);
-    }
-  }
-};
-
-#if GOOGLE_CUDA
-
-namespace {
-template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
-  return typed;
-}
-
-class CublasScratchAllocator : public perftools::gputools::ScratchAllocator {
- public:
-  using Stream = ::perftools::gputools::Stream;
-  using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>;
-
-  CublasScratchAllocator(OpKernelContext* context) : context_(context) {}
-
-  int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; }
-
-  perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
-      Stream* stream, int64 byte_size) override {
-    Tensor temporary_memory;
-
-    Status allocation_status(context_->allocate_temp(
-        DT_UINT8, TensorShape({byte_size}), &temporary_memory));
-    if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
-          DeviceMemoryBytes::MakeFromByteSize(nullptr, 0));
-    }
-    // Hold the reference of the allocated tensors until the end of the
-    // allocator.
-    allocated_tensors_.push_back(temporary_memory);
-    return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
-        DeviceMemoryBytes::MakeFromByteSize(
-            temporary_memory.flat<uint8>().data(),
-            temporary_memory.flat<uint8>().size()));
-  }
-
- private:
-  OpKernelContext* context_;
-  std::vector<Tensor> allocated_tensors_;
-};
-}  // namespace
-
-template <typename Scalar>
-struct LaunchBatchMatMul<GPUDevice, Scalar> {
-  static void Launch(OpKernelContext* context, const Tensor& in_x,
-                     const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-    constexpr perftools::gputools::blas::Transpose kTranspose =
-        is_complex<Scalar>::value
-            ? perftools::gputools::blas::Transpose::kConjugateTranspose
-            : perftools::gputools::blas::Transpose::kTranspose;
-    perftools::gputools::blas::Transpose trans[] = {
-        perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
-    const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
-    const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
-    const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
-    const uint64 batch_size = in_x.dim_size(0);
-    auto blas_transpose_a = trans[adj_x];
-    auto blas_transpose_b = trans[adj_y];
-
-    auto* stream = context->op_device_context()->stream();
-    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
-
-    typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
-    std::vector<DeviceMemoryType> a_device_memory;
-    std::vector<DeviceMemoryType> b_device_memory;
-    std::vector<DeviceMemoryType> c_device_memory;
-    std::vector<DeviceMemoryType*> a_ptrs;
-    std::vector<DeviceMemoryType*> b_ptrs;
-    std::vector<DeviceMemoryType*> c_ptrs;
-    a_device_memory.reserve(batch_size);
-    b_device_memory.reserve(batch_size);
-    c_device_memory.reserve(batch_size);
-    a_ptrs.reserve(batch_size);
-    b_ptrs.reserve(batch_size);
-    c_ptrs.reserve(batch_size);
-    auto* a_base_ptr = in_x.template flat<Scalar>().data();
-    auto* b_base_ptr = in_y.template flat<Scalar>().data();
-    auto* c_base_ptr = out->template flat<Scalar>().data();
-    for (int64 i = 0; i < batch_size; ++i) {
-      a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k));
-      b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n));
-      c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n));
-      a_ptrs.push_back(&a_device_memory.back());
-      b_ptrs.push_back(&b_device_memory.back());
-      c_ptrs.push_back(&c_device_memory.back());
-    }
-
-    // Cublas does
-    //   C = A x B
-    // where A, B and C are assumed to be in column major.
-    // We want the output to be in row-major, so we can compute
-    //   C' = B' x A' (' stands for transpose)
-    CublasScratchAllocator scratch_allocator(context);
-    bool blas_launch_status =
-        stream
-            ->ThenBlasGemmBatchedWithScratch(
-                blas_transpose_b, blas_transpose_a, n, m, k,
-                static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
-                adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n, batch_size,
-                &scratch_allocator)
-            .ok();
-    if (!blas_launch_status) {
-      context->SetStatus(errors::Internal(
-          "Blas SGEMMBatched launch failed : a.shape=",
-          in_x.shape().DebugString(), ", b.shape=", in_y.shape().DebugString(),
-          ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size));
-    }
-  }
-};
-
-#endif  // GOOGLE_CUDA
-
-template <typename Device, typename Scalar>
-class BatchMatMul : public OpKernel {
- public:
-  explicit BatchMatMul(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
-    OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
-  }
-
-  virtual ~BatchMatMul() {}
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& in0 = ctx->input(0);
-    const Tensor& in1 = ctx->input(1);
-    OP_REQUIRES(ctx, in0.dims() == in1.dims(),
-                errors::InvalidArgument("In[0] and In[1] has different ndims: ",
-                                        in0.shape().DebugString(), " vs. ",
-                                        in1.shape().DebugString()));
-    const int ndims = in0.dims();
-    OP_REQUIRES(
-        ctx, ndims >= 2,
-        errors::InvalidArgument("In[0] and In[1] ndims must be >= 2: ", ndims));
-    TensorShape out_shape;
-    for (int i = 0; i < ndims - 2; ++i) {
-      OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
-      out_shape.AddDim(in0.dim_size(i));
-    }
-    auto n = (ndims == 2) ? 1 : out_shape.num_elements();
-    auto d0 = in0.dim_size(ndims - 2);
-    auto d1 = in0.dim_size(ndims - 1);
-    Tensor in0_reshaped;
-    CHECK(in0_reshaped.CopyFrom(in0, TensorShape({n, d0, d1})));
-    auto d2 = in1.dim_size(ndims - 2);
-    auto d3 = in1.dim_size(ndims - 1);
-    Tensor in1_reshaped;
-    CHECK(in1_reshaped.CopyFrom(in1, TensorShape({n, d2, d3})));
-    if (adj_x_) std::swap(d0, d1);
-    if (adj_y_) std::swap(d2, d3);
-    OP_REQUIRES(ctx, d1 == d2,
-                errors::InvalidArgument(
-                    "In[0] mismatch In[1] shape: ", d1, " vs. ", d2, ": ",
-                    in0.shape().DebugString(), " ", in1.shape().DebugString(),
-                    " ", adj_x_, " ", adj_y_));
-    out_shape.AddDim(d0);
-    out_shape.AddDim(d3);
-    Tensor* out = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
-    if (out->NumElements() == 0) {
-      return;
-    }
-    if (in0.NumElements() == 0 || in1.NumElements() == 0) {
-      functor::SetZeroFunctor<Device, Scalar> f;
-      f(ctx->eigen_device<Device>(), out->flat<Scalar>());
-      return;
-    }
-    Tensor out_reshaped;
-    CHECK(out_reshaped.CopyFrom(*out, TensorShape({n, d0, d3})));
-    LaunchBatchMatMul<Device, Scalar>::Launch(ctx, in0_reshaped, in1_reshaped,
-                                              adj_x_, adj_y_, &out_reshaped);
-  }
-
- private:
-  bool adj_x_;
-  bool adj_y_;
-};
-
-#define REGISTER_CPU(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
-      BatchMatMul<CPUDevice, TYPE>)
-
-#define REGISTER_GPU(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
-      BatchMatMul<GPUDevice, TYPE>)
-
-TF_CALL_float(REGISTER_CPU);
-TF_CALL_double(REGISTER_CPU);
-TF_CALL_half(REGISTER_CPU);
-TF_CALL_int32(REGISTER_CPU);
-TF_CALL_complex64(REGISTER_CPU);
-TF_CALL_complex128(REGISTER_CPU);
-
-#if GOOGLE_CUDA
-TF_CALL_float(REGISTER_GPU);
-TF_CALL_double(REGISTER_GPU);
-TF_CALL_complex64(REGISTER_GPU);
-TF_CALL_complex128(REGISTER_GPU);
-#if CUDA_VERSION >= 7050
-TF_CALL_half(REGISTER_GPU);
-#endif
-#endif  // GOOGLE_CUDA
-
-#undef REGISTER_CPU
-#undef REGISTER_GPU
-}  // end namespace tensorflow