Vectorized the evaluation of tensor expressions (using SSE, AVX, NEON, ...)

Added the ability to parallelize the evaluation of a tensor expression over multiple CPU cores. Added the ability to offload the evaluation of a tensor expression to a GPU.
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2014-05-16 15:08:05 -0700
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2014-05-16 15:08:05 -0700
commit: 7402fea0a8e63e3ea248257047c584afee8f8bde (patch)
tree: 429aee7ea314c579ed62c1c5e1ff84850b14370a /unsupported/test
parent: 0320f7e3a71406b9a03d1bab0d168fd76e63d457 (diff)
4 files changed, 194 insertions, 0 deletions
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 31583d3ca..abc3375e5 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -104,4 +104,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_assign "-std=c++0x")
   ei_add_test(cxx11_tensor_expr "-std=c++0x")
   ei_add_test(cxx11_tensor_map "-std=c++0x")
+  ei_add_test(cxx11_tensor_device  "-std=c++0x")
+#  ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
+  ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
 endif()
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp
new file mode 100644
index 000000000..9eb1d0420
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device.cpp
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// CPU evaluation context: bundles two read-only inputs, an output tensor and the DefaultDevice that out() targets.
+struct CPUContext {
+  CPUContext(const Eigen::Tensor<float, 3>& in1, const Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out) { }
+  // Only references are stored, so the three tensors must outlive this context.
+  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
+  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
+  Eigen::TensorDevice<Eigen::Tensor<float, 3>, Eigen::DefaultDevice> out() { return Eigen::TensorDevice<Eigen::Tensor<float, 3>, Eigen::DefaultDevice>(cpu_device_, out_); }
+
+ private:
+  const Eigen::Tensor<float, 3>& in1_;
+  const Eigen::Tensor<float, 3>& in2_;
+  Eigen::Tensor<float, 3>& out_;
+  // Device passed to the TensorDevice that out() returns.
+  Eigen::DefaultDevice cpu_device_;
+};
+
+
+// GPU evaluation context: bundles TensorMaps for two inputs and an output plus the GpuDevice that out() targets.
+struct GPUContext {
+  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out) { }
+  // Only references to the maps are stored, so the maps must outlive this context.
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
+  Eigen::TensorDevice<Eigen::TensorMap<Eigen::Tensor<float, 3> >, Eigen::GpuDevice> out() { return Eigen::TensorDevice<Eigen::TensorMap<Eigen::Tensor<float, 3> >, Eigen::GpuDevice>(gpu_device_, out_); }
+
+ private:
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+  Eigen::GpuDevice gpu_device_;
+};
+
+
+// Evaluates out = in1 + in2 * 3.14f into whatever device-bound target the context's out() returns.
+template <typename Context>
+static void test_contextual_eval(Context* context) {
+  const float scale = 3.14f;
+  context->out() = context->in1() + context->in2() * scale;
+}
+
+static void test_cpu() {
+  // Runs the shared expression through a CPUContext and checks all 2x3x7 coefficients.
+  const Eigen::array<int, 3> dims(2,3,7);
+  Eigen::Tensor<float, 3> lhs(dims);
+  Eigen::Tensor<float, 3> rhs(dims);
+  Eigen::Tensor<float, 3> result(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+  CPUContext context(lhs, rhs, result);
+  test_contextual_eval(&context);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(result(Eigen::array<int, 3>(i,j,k)), lhs(Eigen::array<int, 3>(i,j,k)) + rhs(Eigen::array<int, 3>(i,j,k)) * 3.14f);
+      }
+    }
+  }
+}
+
+static void test_gpu() {
+  // Evaluates the shared expression on the GPU through a GPUContext and verifies the result on the host.
+  Eigen::Tensor<float, 3> in1(Eigen::array<int, 3>(2,3,7));
+  Eigen::Tensor<float, 3> in2(Eigen::array<int, 3>(2,3,7));
+  Eigen::Tensor<float, 3> out(Eigen::array<int, 3>(2,3,7));
+  in1.setRandom();
+  in2.setRandom();
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+  // Device buffers: every CUDA call is checked, and the buffers are released once the result is verified.
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  VERIFY(cudaMalloc(reinterpret_cast<void**>(&d_in1), in1_bytes) == cudaSuccess);
+  VERIFY(cudaMalloc(reinterpret_cast<void**>(&d_in2), in2_bytes) == cudaSuccess);
+  VERIFY(cudaMalloc(reinterpret_cast<void**>(&d_out), out_bytes) == cudaSuccess);
+  VERIFY(cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice) == cudaSuccess);
+  VERIFY(cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice) == cudaSuccess);
+  // View the device buffers as 2x3x7 tensors so they can take part in the expression.
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(2,3,7));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(2,3,7));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(2,3,7));
+  GPUContext context(gpu_in1, gpu_in2, gpu_out);
+  test_contextual_eval(&context);
+  VERIFY(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f);
+      }
+    }
+  }
+  VERIFY(cudaFree(d_in1) == cudaSuccess);
+  VERIFY(cudaFree(d_in2) == cudaSuccess);
+  VERIFY(cudaFree(d_out) == cudaSuccess);
+}
+
+
+
+void test_cxx11_tensor_device() {
+  // Test entry point: run the CPU subtest first, then the GPU subtest.
+  CALL_SUBTEST(test_cpu());
+  CALL_SUBTEST(test_gpu());
+}
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index c1d74d881..214f6951d 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -159,9 +159,37 @@ static void test_3d()
 }
 
 
+static void test_array()
+{
+  // Fill mat1 so that each coefficient holds its visit order 0, 1, ..., 41.
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(array<ptrdiff_t, 3>(i,j,k)) = static_cast<float>((i * 3 + j) * 7 + k);
+      }
+    }
+  }
+
+  // Apply cwisePow with exponent 3.5 and store the result in a second fixed-size tensor.
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.cwisePow(3.5f);
+
+  // Each result must match powf applied to the value written at the same position.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        const float expected = powf(static_cast<float>((i * 3 + j) * 7 + k), 3.5f);
+        VERIFY_IS_APPROX(mat3(array<ptrdiff_t, 3>(i,j,k)), expected);
+      }
+    }
+  }
+}
+
 void test_cxx11_tensor_fixed_size()
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
   CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_array());  // added by this patch: checks cwisePow on a 2x3x7 fixed-size tensor
 }
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
new file mode 100644
index 000000000..c9de71da3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -0,0 +1,37 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cxx11_tensor_thread_pool()
+{
+  // Evaluates in1 + in2 * 3.14f through a ThreadPoolDevice and checks every coefficient.
+  Eigen::Tensor<float, 3> in1(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  Eigen::Tensor<float, 3> in2(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  Eigen::Tensor<float, 3> out(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  in1.setRandom();
+  in2.setRandom();
+  Eigen::ThreadPoolDevice thread_pool_device(3);
+  // Float literal on purpose: the tensors hold floats and the reference below multiplies by 3.14f.
+  out.device(thread_pool_device) = in1 + in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<ptrdiff_t, 3>(i,j,k)), in1(Eigen::array<ptrdiff_t, 3>(i,j,k)) + in2(Eigen::array<ptrdiff_t, 3>(i,j,k)) * 3.14f);
+      }
+    }
+  }
+}
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2014-05-16 15:08:05 -0700
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2014-05-16 15:08:05 -0700
commit	7402fea0a8e63e3ea248257047c584afee8f8bde (patch)
tree	429aee7ea314c579ed62c1c5e1ff84850b14370a /unsupported/test
parent	0320f7e3a71406b9a03d1bab0d168fd76e63d457 (diff)