Add tiled evaluation for TensorForcedEvalOp

author: Eugene Zhulenev <ezhulenev@google.com> 2019-03-04 16:02:22 -0800
committer: Eugene Zhulenev <ezhulenev@google.com> 2019-03-04 16:02:22 -0800
commit: b95941e5c2cf8886a54e510be662cf4ecadc4f6f (patch)
tree: f53e57bf540a63b4b533b2d8113b9d02c3dbcb62 /unsupported
parent: 694084ecbd12c5183a8ff0604d04971d043abfff (diff)
2 files changed, 58 insertions, 6 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 78068be35..74b905329 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -90,14 +90,21 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = true,
-    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess = false,
+    IsAligned         = true,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = internal::is_arithmetic<CoeffReturnType>::value,  // block access advertised only for arithmetic coefficient types
     PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = true
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = true
   };
 
+  typedef typename internal::TensorBlock<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlock;  // block type accepted by block(TensorBlock*)
+  typedef typename internal::TensorBlockReader<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlockReader;  // invoked by block() as TensorBlockReader::Run(block, m_buffer)
+
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
   /// op_ is used for sycl
       : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
@@ -139,6 +146,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>*) const {}  // intentionally a no-op: nothing is appended to the list
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+    eigen_assert(m_buffer != NULL);  // the constructor initializes m_buffer to NULL; it must be set before blocks are served
+    TensorBlockReader::Run(block, m_buffer);  // serve the requested block from m_buffer
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 18c87b35e..608306613 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -452,6 +452,38 @@ static void test_execute_slice_lvalue(Device d)
   }
 }
 
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    bool Tileable, int Layout>
+static void test_execute_broadcasting_of_forced_eval(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  using TensorType = Tensor<T, NumDims, Options, Index>;
+
+  auto input_dims = RandomDims<NumDims>(1, 10);
+  TensorType input(input_dims);
+  input.setRandom();
+
+  // Expression under test: broadcast of a forced evaluation (`.eval()`).
+  const auto bcast = RandomDims<NumDims>(1, 7);
+  const auto forced = input.square().eval().broadcast(bcast);
+
+  // Reference result via plain assignment: broadcasting on a default device
+  // is assumed to be tested and correct, so it is the baseline to compare to.
+  TensorType expected;
+  expected = forced;
+
+  // Evaluate the same expression through the explicitly configured executor.
+  TensorType actual(expected.dimensions());
+  using AssignOp = TensorAssignOp<decltype(actual), const decltype(forced)>;
+  using ExecutorUnderTest =
+      internal::TensorExecutor<const AssignOp, Device, Vectorizable, Tileable>;
+  ExecutorUnderTest::run(AssignOp(actual, forced), d);
+
+  for (Index i = 0; i < actual.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(actual.coeff(i), expected.coeff(i));
+  }
+}
+
 #define CALL_SUBTEST_PART(PART) \
   CALL_SUBTEST_##PART
 
@@ -528,8 +560,13 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
   CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
   CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
 
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
+
   // Force CMake to split this test.
-  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12
 }
 
 #undef CALL_SUBTEST_COMBINATIONS
author	Eugene Zhulenev <ezhulenev@google.com>	2019-03-04 16:02:22 -0800
committer	Eugene Zhulenev <ezhulenev@google.com>	2019-03-04 16:02:22 -0800
commit	b95941e5c2cf8886a54e510be662cf4ecadc4f6f (patch)
tree	f53e57bf540a63b4b533b2d8113b9d02c3dbcb62 /unsupported
parent	694084ecbd12c5183a8ff0604d04971d043abfff (diff)