author    A. Unique TensorFlower <gardener@tensorflow.org>  2016-09-30 12:32:28 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2016-09-30 13:49:10 -0700
commit    3057e97bf5d8079f6e139212c6812fd07e989390 (patch)
tree      8ce6dd55c63543592426f8477737ee314f3d7382 /tensorflow/core/kernels/batch_matmul_op.cc
parent    113093b017c8b2654c052a054e63738174bae649 (diff)
Shard batch_matmul_op across files to speed up build.
Change: 134820471
Diffstat (limited to 'tensorflow/core/kernels/batch_matmul_op.cc')
-rw-r--r--  tensorflow/core/kernels/batch_matmul_op.cc | 465
1 file changed, 0 insertions(+), 465 deletions(-)
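
The build speedup comes from splitting one large translation unit into smaller ones that the build system can compile in parallel, instead of recompiling every template instantiation in a single file. A minimal sketch of the pattern, assuming the kernel body moves into a shared implementation header (the file layout below is illustrative, not taken from this commit):

    // batch_matmul_op_impl.h (illustrative name): the templated kernel body.
    template <typename Device, typename Scalar>
    class BatchMatMul;  // definition as in the deleted file below

    // batch_matmul_op_real.cc (illustrative): instantiate only real types.
    #include "batch_matmul_op_impl.h"
    TF_CALL_float(REGISTER_CPU);
    TF_CALL_double(REGISTER_CPU);

    // batch_matmul_op_complex.cc (illustrative): instantiate only the complex
    // types, so the slow-to-compile complex Eigen contractions build
    // concurrently with the real ones.
    #include "batch_matmul_op_impl.h"
    TF_CALL_complex64(REGISTER_CPU);
    TF_CALL_complex128(REGISTER_CPU);
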
diff --git a/tensorflow/core/kernels/batch_matmul_op.cc b/tensorflow/core/kernels/batch_matmul_op.cc
deleted file mode 100644
index f8c12927e3..0000000000
--- a/tensorflow/core/kernels/batch_matmul_op.cc
+++ /dev/null
@@ -1,465 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/math_ops.cc.
-
-#define EIGEN_USE_THREADS
-
-#include <vector>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/type_traits.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/fill_functor.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/work_sharder.h"
-
-#if GOOGLE_CUDA
-#include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
-
-namespace tensorflow {
-
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
-
-namespace {
-
-Eigen::IndexPair<Eigen::DenseIndex> ContractionDims(bool adj_x, bool adj_y) {
-  if (!adj_x) {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 1);
-    }
-  } else {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 1);
-    }
-  }
-}
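
The returned pair (i, j) tells Eigen to contract dimension i of the left operand with dimension j of the right one; adjointed operands contract along their first matrix dimension instead of their second. A standalone sketch of the same API outside this file (plain Eigen, no TensorFlow types):

    #include <unsupported/Eigen/CXX11/Tensor>

    void ContractExample() {
      Eigen::Tensor<float, 2> x(2, 3);
      Eigen::Tensor<float, 2> y(3, 4);
      x.setRandom();
      y.setRandom();
      // ContractionDims(false, false) == IndexPair(1, 0): contract dim 1 of x
      // with dim 0 of y, i.e. an ordinary (2x3) * (3x4) matrix product.
      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> pairs;
      pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      Eigen::Tensor<float, 2> z = x.contract(y, pairs);  // shape (2, 4)
    }
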
-
-// Parallel batch matmul kernel based on the multi-threaded tensor contraction
-// in Eigen.
-template <typename Scalar, bool IsComplex = true>
-struct ParallelMatMulKernel {
-  static void Conjugate(const OpKernelContext* context, Tensor* out) {
-    const Eigen::ThreadPoolDevice d = context->eigen_cpu_device();
-    auto z = out->tensor<Scalar, 3>();
-    z.device(d) = z.conjugate();
-  }
-
-  static void Run(const OpKernelContext* context, const Tensor& in_x,
-                  const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out,
-                  int start, int limit) {
-    static_assert(IsComplex, "Complex type expected.");
-    auto Tx = in_x.tensor<Scalar, 3>();
-    auto Ty = in_y.tensor<Scalar, 3>();
-    auto Tz = out->tensor<Scalar, 3>();
-    // We use the identities
-    //   conj(a) * conj(b) = conj(a * b)
-    //   conj(a) * b = conj(a * conj(b))
-    // to halve the number of cases. The final conjugation of the result is
-    // done at the end of LaunchBatchMatMul<CPUDevice, Scalar>::Launch().
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
-    contract_pairs[0] = ContractionDims(adj_x, adj_y);
-    const Eigen::ThreadPoolDevice d = context->eigen_cpu_device();
-    for (int i = start; i < limit; ++i) {
-      auto x = Tx.template chip<0>(i);
-      auto z = Tz.template chip<0>(i);
-      if (adj_x != adj_y) {
-        auto y = Ty.template chip<0>(i).conjugate();
-        z.device(d) = x.contract(y, contract_pairs);
-      } else {
-        auto y = Ty.template chip<0>(i);
-        z.device(d) = x.contract(y, contract_pairs);
-      }
-    }
-  }
-};
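
Written out, with x^H denoting the conjugate transpose and an overline denoting elementwise conjugation, the four (adj_x, adj_y) cases reduce to plain transposed contractions:

    \begin{aligned}
      z &= x\,y                                        &&\text{neither adjointed}\\
      z &= x\,\overline{y}^{\,T} = x\,y^{H}            &&\text{adj\_y only: conjugate } y\\
      z &= \overline{x^{T}\,\overline{y}} = x^{H}\,y   &&\text{adj\_x only: conjugate } y \text{ and the result}\\
      z &= \overline{x^{T}\,y^{T}} = x^{H}\,y^{H}      &&\text{both: conjugate only the result}
    \end{aligned}

This is exactly the rule implemented in Run(): y is conjugated when adj_x != adj_y, and LaunchBatchMatMul conjugates the final result (via Conjugate() above) when adj_x is true.
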
-
-// The Eigen contraction kernel used here is very large and slow to compile,
-// so we partially specialize ParallelMatMulKernel for real types to avoid all
-// but one of the instantiations.
-template <typename Scalar>
-struct ParallelMatMulKernel<Scalar, false> {
-  static void Conjugate(const OpKernelContext* context, Tensor* out) {}
-
-  static void Run(const OpKernelContext* context, const Tensor& in_x,
-                  const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out,
-                  int start, int limit) {
-    auto Tx = in_x.tensor<Scalar, 3>();
-    auto Ty = in_y.tensor<Scalar, 3>();
-    auto Tz = out->tensor<Scalar, 3>();
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
-    contract_pairs[0] = ContractionDims(adj_x, adj_y);
-    const Eigen::ThreadPoolDevice d = context->eigen_cpu_device();
-    for (int i = start; i < limit; ++i) {
-      auto x = Tx.template chip<0>(i);
-      auto y = Ty.template chip<0>(i);
-      auto z = Tz.template chip<0>(i);
-      z.device(d) = x.contract(y, contract_pairs);
-    }
-  }
-};
-
-// TODO(rmlarsen): Get rid of this when we have upstreamed improvements
-// for matrix*vector and vector*matrix to Eigen's general matrix product.
-template <typename Tx, typename Ty, typename Tz>
-static void Multiply(bool adj_x, bool adj_y, Tx x, Ty y, Tz z) {
-  if (!adj_x) {
-    if (!adj_y) {
-      z.noalias() = x * y;
-    } else {
-      z.noalias() = x * y.adjoint();
-    }
-  } else {
-    if (!adj_y) {
-      z.noalias() = x.adjoint() * y;
-    } else {
-      z.noalias() = x.adjoint() * y.adjoint();
-    }
-  }
-}
-
-// Sequential batch matmul kernel that calls the regular Eigen matmul.
-// We prefer this over the tensor contraction because it performs
-// better on vector-matrix and matrix-vector products.
-template <typename Scalar>
-struct SequentialMatMulKernel {
-  using Matrix =
-      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  using ConstMatrixMap = Eigen::Map<const Matrix>;
-  using MatrixMap = Eigen::Map<Matrix>;
-
-  static ConstMatrixMap ConstTensorSliceToEigenMatrix(const Tensor& t,
-                                                      int slice) {
-    return ConstMatrixMap(
-        t.flat<Scalar>().data() + slice * t.dim_size(1) * t.dim_size(2),
-        t.dim_size(1), t.dim_size(2));
-  }
-
-  static MatrixMap TensorSliceToEigenMatrix(Tensor* t, int slice) {
-    return MatrixMap(
-        t->flat<Scalar>().data() + slice * t->dim_size(1) * t->dim_size(2),
-        t->dim_size(1), t->dim_size(2));
-  }
-
-  static void Run(const Tensor& in_x, const Tensor& in_y, bool adj_x,
-                  bool adj_y, Tensor* out, int start, int limit) {
-    for (int i = start; i < limit; ++i) {
-      auto x = ConstTensorSliceToEigenMatrix(in_x, i);
-      auto y = ConstTensorSliceToEigenMatrix(in_y, i);
-      auto z = TensorSliceToEigenMatrix(out, i);
-      // TODO(rmlarsen): Get rid of the special casing here when we have
-      // upstreamed improvements for matrix*vector and vector*matrix to
-      // Eigen's general matrix product.
-      if (!adj_x && x.rows() == 1) {
-        Multiply(adj_x, adj_y, x.row(0), y, z);
-      } else if (adj_x && x.cols() == 1) {
-        Multiply(adj_x, adj_y, x.col(0), y, z);
-      } else if (!adj_y && y.cols() == 1) {
-        Multiply(adj_x, adj_y, x, y.col(0), z);
-      } else if (adj_y && y.rows() == 1) {
-        Multiply(adj_x, adj_y, x, y.row(0), z);
-      } else {
-        Multiply(adj_x, adj_y, x, y, z);
-      }
-    }
-  }
-};
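
The Eigen::Map helpers above reinterpret slice i of a contiguous row-major [batch, rows, cols] tensor in place: slice i begins i * rows * cols elements into the buffer, and no data is copied. A standalone sketch of the same idea (plain Eigen; SliceView is a hypothetical helper, not part of this file):

    #include <cstddef>
    #include <Eigen/Dense>

    using RowMajorMatrix =
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

    // View slice `i` of a [batch, rows, cols] buffer as a rows x cols matrix.
    Eigen::Map<const RowMajorMatrix> SliceView(const float* data, int i,
                                               int rows, int cols) {
      const std::ptrdiff_t offset =
          static_cast<std::ptrdiff_t>(i) * rows * cols;
      return Eigen::Map<const RowMajorMatrix>(data + offset, rows, cols);
    }
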
-
-} // namespace
-
-template <typename Device, typename Scalar>
-struct LaunchBatchMatMul;
-
-template <typename Scalar>
-struct LaunchBatchMatMul<CPUDevice, Scalar> {
-  static void Launch(OpKernelContext* context, const Tensor& in_x,
-                     const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-    typedef ParallelMatMulKernel<Scalar, Eigen::NumTraits<Scalar>::IsComplex>
-        ParallelMatMulKernel;
-    bool conjugate_result = false;
-
-    // Number of matrix multiplies, i.e. the size of the batch.
-    const int64 num_units = in_x.dim_size(0);
-    const int64 cost_per_unit =
-        in_x.dim_size(1) * in_x.dim_size(2) * out->dim_size(2);
-    const int64 min_dim = std::min(
-        std::min(in_x.dim_size(1), in_x.dim_size(2)), out->dim_size(2));
-    const int64 kMaxCostOuterParallelism = 128 * 256 * 256;  // heuristic.
-    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    if (min_dim > 1 &&
-        (num_units == 1 || cost_per_unit > kMaxCostOuterParallelism)) {
-      // Parallelize over inner dims.
-      // For large matrix products it is counter-productive to parallelize
-      // over the batch dimension.
-      ParallelMatMulKernel::Run(context, in_x, in_y, adj_x, adj_y, out, 0,
-                                num_units);
-      conjugate_result = adj_x;
-    } else if (min_dim > 1 && worker_threads.num_threads > num_units) {
-      // Parallelize over both outer and inner dims.
-      // TODO(rmlarsen): The parallelized contraction in Eigen can deadlock
-      // when running num_threads or more contractions in parallel. Launch on
-      // all worker_threads.num_threads threads here once that is fixed.
-      Shard(std::max(1, worker_threads.num_threads - 1),
-            worker_threads.workers, num_units, cost_per_unit,
-            [context, &in_x, &in_y, adj_x, adj_y, out](int start, int limit) {
-              ParallelMatMulKernel::Run(context, in_x, in_y, adj_x, adj_y,
-                                        out, start, limit);
-            });
-      conjugate_result = adj_x;
-    } else {
-      // Parallelize over outer dims. For small matrices and large batches, it
-      // is counter-productive to parallelize the inner matrix multiplies.
-      Shard(worker_threads.num_threads, worker_threads.workers, num_units,
-            cost_per_unit,
-            [&in_x, &in_y, adj_x, adj_y, out](int start, int limit) {
-              SequentialMatMulKernel<Scalar>::Run(in_x, in_y, adj_x, adj_y,
-                                                  out, start, limit);
-            });
-    }
-    if (conjugate_result) {
-      // Since we used one of the identities
-      //   conj(a) * conj(b) = conj(a * b)
-      //   conj(a) * b = conj(a * conj(b))
-      // above, we need to conjugate the final output. This is a no-op for
-      // non-complex types.
-      ParallelMatMulKernel::Conjugate(context, out);
-    }
-  }
-};
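
As a worked example of the dispatch above: a batch of 64 products of 32x32 matrices has cost_per_unit = 32*32*32 = 32768, far below kMaxCostOuterParallelism = 128*256*256 = 8388608, so with e.g. 8 worker threads (8 <= num_units = 64) the final branch runs: the batch itself is sharded across threads, each running the sequential single-threaded kernel. By contrast, a batch of 2 products of 1024x1024 matrices has cost_per_unit = 2^30 > kMaxCostOuterParallelism and takes the first branch, letting Eigen's multi-threaded contraction parallelize inside each individual product.
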
-
-#if GOOGLE_CUDA
-
-namespace {
-template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
-  return typed;
-}
-
-class CublasScratchAllocator : public perftools::gputools::ScratchAllocator {
- public:
-  using Stream = ::perftools::gputools::Stream;
-  using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>;
-
-  CublasScratchAllocator(OpKernelContext* context) : context_(context) {}
-
-  int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; }
-
-  perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
-      Stream* stream, int64 byte_size) override {
-    Tensor temporary_memory;
-
-    Status allocation_status(context_->allocate_temp(
-        DT_UINT8, TensorShape({byte_size}), &temporary_memory));
-    if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
-          DeviceMemoryBytes::MakeFromByteSize(nullptr, 0));
-    }
-    // Hold references to the allocated tensors until the allocator is
-    // destroyed, so the scratch memory stays valid while cuBLAS uses it.
-    allocated_tensors_.push_back(temporary_memory);
-    return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
-        DeviceMemoryBytes::MakeFromByteSize(
-            temporary_memory.flat<uint8>().data(),
-            temporary_memory.flat<uint8>().size()));
-  }
-
- private:
-  OpKernelContext* context_;
-  std::vector<Tensor> allocated_tensors_;
-};
-} // namespace
-
-template <typename Scalar>
-struct LaunchBatchMatMul<GPUDevice, Scalar> {
-  static void Launch(OpKernelContext* context, const Tensor& in_x,
-                     const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-    constexpr perftools::gputools::blas::Transpose kTranspose =
-        is_complex<Scalar>::value
-            ? perftools::gputools::blas::Transpose::kConjugateTranspose
-            : perftools::gputools::blas::Transpose::kTranspose;
-    perftools::gputools::blas::Transpose trans[] = {
-        perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
-    const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
-    const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
-    const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
-    const uint64 batch_size = in_x.dim_size(0);
-    auto blas_transpose_a = trans[adj_x];
-    auto blas_transpose_b = trans[adj_y];
-
-    auto* stream = context->op_device_context()->stream();
-    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
-
-    typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
-    std::vector<DeviceMemoryType> a_device_memory;
-    std::vector<DeviceMemoryType> b_device_memory;
-    std::vector<DeviceMemoryType> c_device_memory;
-    std::vector<DeviceMemoryType*> a_ptrs;
-    std::vector<DeviceMemoryType*> b_ptrs;
-    std::vector<DeviceMemoryType*> c_ptrs;
-    a_device_memory.reserve(batch_size);
-    b_device_memory.reserve(batch_size);
-    c_device_memory.reserve(batch_size);
-    a_ptrs.reserve(batch_size);
-    b_ptrs.reserve(batch_size);
-    c_ptrs.reserve(batch_size);
-    auto* a_base_ptr = in_x.template flat<Scalar>().data();
-    auto* b_base_ptr = in_y.template flat<Scalar>().data();
-    auto* c_base_ptr = out->template flat<Scalar>().data();
-    for (int64 i = 0; i < batch_size; ++i) {
-      a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k));
-      b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n));
-      c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n));
-      a_ptrs.push_back(&a_device_memory.back());
-      b_ptrs.push_back(&b_device_memory.back());
-      c_ptrs.push_back(&c_device_memory.back());
-    }
-
-    // Cublas does
-    //   C = A x B
-    // where A, B and C are assumed to be in column-major order.
-    // We want the output in row-major order, so we compute
-    //   C' = B' x A'  (' stands for transpose)
-    CublasScratchAllocator scratch_allocator(context);
-    bool blas_launch_status =
-        stream
-            ->ThenBlasGemmBatchedWithScratch(
-                blas_transpose_b, blas_transpose_a, n, m, k,
-                static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
-                adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n, batch_size,
-                &scratch_allocator)
-            .ok();
-    if (!blas_launch_status) {
-      context->SetStatus(errors::Internal(
-          "Blas SGEMMBatched launch failed: a.shape=",
-          in_x.shape().DebugString(), ", b.shape=", in_y.shape().DebugString(),
-          ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size));
-    }
-  }
-};
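
The operand swap in the GEMM call rests on the identity

    C^{T} = (A \times B)^{T} = B^{T} \times A^{T}

plus the observation that a row-major m-by-n matrix C occupies memory exactly like a column-major n-by-m matrix holding C'. Asking column-major cuBLAS for B x A with the dimensions passed as (n, m, k) therefore writes C' in column-major order, which is bit-for-bit the row-major C that TensorFlow wants. That is also why the leading dimensions are adj_y ? k : n for B, adj_x ? m : k for A, and n for C.
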
-
-#endif // GOOGLE_CUDA
-
-template <typename Device, typename Scalar>
-class BatchMatMul : public OpKernel {
- public:
-  explicit BatchMatMul(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
-    OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
-  }
-
-  virtual ~BatchMatMul() {}
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& in0 = ctx->input(0);
-    const Tensor& in1 = ctx->input(1);
-    OP_REQUIRES(ctx, in0.dims() == in1.dims(),
-                errors::InvalidArgument(
-                    "In[0] and In[1] have different ndims: ",
-                    in0.shape().DebugString(), " vs. ",
-                    in1.shape().DebugString()));
-    const int ndims = in0.dims();
-    OP_REQUIRES(
-        ctx, ndims >= 2,
-        errors::InvalidArgument("In[0] and In[1] ndims must be >= 2: ", ndims));
-    TensorShape out_shape;
-    for (int i = 0; i < ndims - 2; ++i) {
-      OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
-      out_shape.AddDim(in0.dim_size(i));
-    }
-    auto n = (ndims == 2) ? 1 : out_shape.num_elements();
-    auto d0 = in0.dim_size(ndims - 2);
-    auto d1 = in0.dim_size(ndims - 1);
-    Tensor in0_reshaped;
-    CHECK(in0_reshaped.CopyFrom(in0, TensorShape({n, d0, d1})));
-    auto d2 = in1.dim_size(ndims - 2);
-    auto d3 = in1.dim_size(ndims - 1);
-    Tensor in1_reshaped;
-    CHECK(in1_reshaped.CopyFrom(in1, TensorShape({n, d2, d3})));
-    if (adj_x_) std::swap(d0, d1);
-    if (adj_y_) std::swap(d2, d3);
-    OP_REQUIRES(ctx, d1 == d2,
-                errors::InvalidArgument(
-                    "Inner dims of In[0] and In[1] must match: ", d1, " vs. ",
-                    d2, ": ", in0.shape().DebugString(), " ",
-                    in1.shape().DebugString(), " ", adj_x_, " ", adj_y_));
-    out_shape.AddDim(d0);
-    out_shape.AddDim(d3);
-    Tensor* out = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
-    if (out->NumElements() == 0) {
-      return;
-    }
-    if (in0.NumElements() == 0 || in1.NumElements() == 0) {
-      functor::SetZeroFunctor<Device, Scalar> f;
-      f(ctx->eigen_device<Device>(), out->flat<Scalar>());
-      return;
-    }
-    Tensor out_reshaped;
-    CHECK(out_reshaped.CopyFrom(*out, TensorShape({n, d0, d3})));
-    LaunchBatchMatMul<Device, Scalar>::Launch(ctx, in0_reshaped, in1_reshaped,
-                                              adj_x_, adj_y_, &out_reshaped);
-  }
-
- private:
-  bool adj_x_;
-  bool adj_y_;
-};
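
A concrete trace of the reshaping in Compute(): multiplying a [2, 3, 4, 5] tensor by a [2, 3, 5, 6] tensor with adj_x = adj_y = false, the outer dims [2, 3] match and give n = 6; the inputs are viewed as [6, 4, 5] and [6, 5, 6] (CopyFrom shares the underlying buffer and only rewrites the shape); the inner-dimension check sees d1 = d2 = 5; and the output is allocated as [2, 3, 4, 6] but handed to the launcher as a [6, 4, 6] view.
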
-
-#define REGISTER_CPU(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
-      BatchMatMul<CPUDevice, TYPE>)
-
-#define REGISTER_GPU(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
-      BatchMatMul<GPUDevice, TYPE>)
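
Each TF_CALL_<type> macro below applies its macro argument to one concrete type, so for example TF_CALL_float(REGISTER_CPU) expands (via REGISTER_CPU(float)) to

    REGISTER_KERNEL_BUILDER(
        Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<float>("T"),
        BatchMatMul<CPUDevice, float>)

which registers the CPU kernel specialization for T = float.
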
-
-TF_CALL_float(REGISTER_CPU);
-TF_CALL_double(REGISTER_CPU);
-TF_CALL_half(REGISTER_CPU);
-TF_CALL_int32(REGISTER_CPU);
-TF_CALL_complex64(REGISTER_CPU);
-TF_CALL_complex128(REGISTER_CPU);
-
-#if GOOGLE_CUDA
-TF_CALL_float(REGISTER_GPU);
-TF_CALL_double(REGISTER_GPU);
-TF_CALL_complex64(REGISTER_GPU);
-TF_CALL_complex128(REGISTER_GPU);
-#if CUDA_VERSION >= 7050
-TF_CALL_half(REGISTER_GPU);
-#endif
-#endif // GOOGLE_CUDA
-
-#undef REGISTER_CPU
-#undef REGISTER_GPU
-} // end namespace tensorflow