diff options
author | 2016-02-23 11:09:05 +0100 | |
---|---|---|
committer | 2016-02-23 11:09:05 +0100 | |
commit | 91e1375ba97284d1a11068d27c039800ec7900f1 (patch) | |
tree | ce96e6c443a28e1ea6d189b56cc08460c475b77c | |
parent | 055000a42466670d7fd0162f026cde9ab90f9b25 (diff) | |
parent | 1d9256f7db5db6c9f7fa915b4af868625f53502f (diff) |
merge
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 26 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks.h | 234 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_cpu.cc | 42 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_gpu.cu | 6 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/Tensor | 1 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 17 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 16 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 55 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 22 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 32 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 52 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 75 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_of_float16_cuda.cu | 41 |
13 files changed, 273 insertions, 346 deletions
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 7af0bdc60..1a1b4ec3d 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -52,14 +52,19 @@ __device__ half operator /= (half& a, const half& b) { a = a / b; return a; } -__device__ half __shfl_xor(half a, int) { - assert(false && "tbd"); - return a; + +namespace std { +__device__ half abs(const half& a) { + half result; + result.x = a.x & 0x7FFF; + return result; +} } namespace Eigen { namespace internal { +template<> struct is_arithmetic<half> { enum { value = true }; }; template<> struct is_arithmetic<half2> { enum { value = true }; }; template<> struct packet_traits<half> : default_packet_traits @@ -214,17 +219,20 @@ template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) { } template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) { - assert(false && "tbd"); - return half2(); + half2 result; + result.x = a.x & 0x7FFF7FFF; + return result; } EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<half2,2>& kernel) { - assert(false && "tbd"); - // half tmp = kernel.packet[0].y; - // kernel.packet[0].y = kernel.packet[1].x; - // kernel.packet[1].x = tmp; + half a1 = __low2half(kernel.packet[0]); + half a2 = __high2half(kernel.packet[0]); + half b1 = __low2half(kernel.packet[1]); + half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } } // end namespace internal diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index f3ec70a9e..b208a401a 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -15,7 +15,7 @@ using Eigen::TensorMap; // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. -template <typename Device> class BenchmarkSuite { +template <typename Device, typename T> class BenchmarkSuite { public: BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) : m_(m), k_(k), n_(n), device_(device) { @@ -37,7 +37,7 @@ template <typename Device> class BenchmarkSuite { eigen_assert(m_ == k_ && k_ == n_); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); } // Record the number of values copied per second finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); @@ -45,13 +45,15 @@ template <typename Device> class BenchmarkSuite { void typeCasting(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}}; - const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = k_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes); TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - B.device(device_) = A.cast<int>(); + B.device(device_) = A.template cast<int>(); } // Record the number of values copied per second finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); @@ -59,8 +61,10 @@ template <typename Device> class BenchmarkSuite { void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -72,10 +76,12 @@ template <typename Device> class BenchmarkSuite { void slicing(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2); const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0); @@ -100,10 +106,13 @@ template <typename Device> class BenchmarkSuite { } void rowChip(int num_iters) { - const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}}; - const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); - const Eigen::array<TensorIndex, 1> output_size = {{n_}}; - TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + Eigen::array<TensorIndex, 1> output_size; + output_size[0] = n_; + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -114,10 +123,13 @@ template <typename Device> class BenchmarkSuite { } void colChip(int num_iters) { - const Eigen::array<TensorIndex, 2> input_size= {{k_, n_}}; - const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); - const Eigen::array<TensorIndex, 1> output_size = {{n_}}; - TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + Eigen::array<TensorIndex, 1> output_size; + output_size[0] = n_; + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -129,12 +141,18 @@ template <typename Device> class BenchmarkSuite { void shuffling(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); - - const Eigen::array<int, 2> shuffle = {{1, 0}}; + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = k_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_b; + size_b[0] = k_; + size_b[1] = m_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); + + Eigen::array<int, 2> shuffle; + shuffle[0] = 1; + shuffle[1] = 0; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -146,10 +164,14 @@ template <typename Device> class BenchmarkSuite { void padding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = k_-3; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_b; + size_b[0] = k_; + size_b[1] = m_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); @@ -165,12 +187,18 @@ template <typename Device> class BenchmarkSuite { void striding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); - - const Eigen::array<TensorIndex, 2> strides = {{1, 2}}; + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = k_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_b; + size_b[0] = m_; + size_b[1] = k_/2; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); + + Eigen::array<TensorIndex, 2> strides; + strides[0] = 1; + strides[1] = 2; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -181,13 +209,19 @@ template <typename Device> class BenchmarkSuite { } void broadcasting(int num_iters) { - const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c); + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = 1; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_c; + size_c[0] = m_; + size_c[1] = n_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array<int, 2> broadcast = {{1, n_}}; + Eigen::array<int, 2> broadcast; + broadcast[0] = 1; + broadcast[1] = n_; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -205,10 +239,12 @@ template <typename Device> class BenchmarkSuite { void coeffWiseOp(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -221,10 +257,12 @@ template <typename Device> class BenchmarkSuite { void algebraicFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -237,10 +275,12 @@ template <typename Device> class BenchmarkSuite { void transcendentalFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -253,13 +293,16 @@ template <typename Device> class BenchmarkSuite { // Row reduction void rowReduction(int num_iters) { - const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}}; - const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); const Eigen::array<TensorIndex, 1> output_size = {{n_}}; - TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}}; + Eigen::array<TensorIndex, 1> sum_along_dim; + sum_along_dim[0] = 0; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -277,15 +320,18 @@ template <typename Device> class BenchmarkSuite { // Column reduction void colReduction(int num_iters) { - const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}}; - const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B( + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B( b_, input_size); const Eigen::array<TensorIndex, 1> output_size = {{k_}}; - TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C( + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C( c_, output_size); #ifndef EIGEN_HAS_INDEX_LIST - const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}}; + Eigen::array<TensorIndex, 1> sum_along_dim; + sum_along_dim = 1; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -303,16 +349,23 @@ template <typename Device> class BenchmarkSuite { // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}}; - const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}}; - const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}}; - - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC); - - typedef typename Tensor<float, 2>::DimensionPair DimPair; - const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -324,14 +377,21 @@ template <typename Device> class BenchmarkSuite { } void convolution(int num_iters, int kernel_x, int kernel_y) { - const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes); - const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes); - const Eigen::array<TensorIndex, 2> result_sizes = - {{m_ - kernel_x + 1, n_ - kernel_y + 1}}; - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes); - Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}}; + Eigen::array<TensorIndex, 2> input_sizes; + input_sizes[0] = m_; + input_sizes[1] = n_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes); + Eigen::array<TensorIndex, 2> kernel_sizes; + kernel_sizes[0] = kernel_x; + kernel_sizes[1] = kernel_y; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes); + Eigen::array<TensorIndex, 2> result_sizes; + result_sizes[0] = m_ - kernel_x + 1; + result_sizes[1] = n_ - kernel_y + 1; + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes); + Eigen::array<TensorIndex, 2> dims; + dims[0] = 0; + dims[1] = 1; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -345,15 +405,15 @@ template <typename Device> class BenchmarkSuite { private: void initialize() { - a_ = (float *) device_.allocate(m_ * k_ * sizeof(float)); - b_ = (float *) device_.allocate(k_ * n_ * sizeof(float)); - c_ = (float *) device_.allocate(m_ * n_ * sizeof(float)); + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. - device_.memset(a_, 12, m_ * k_ * sizeof(float)); - device_.memset(b_, 23, k_ * n_ * sizeof(float)); - device_.memset(c_, 31, m_ * n_ * sizeof(float)); + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); //BenchmarkUseRealTime(); } @@ -372,9 +432,9 @@ template <typename Device> class BenchmarkSuite { TensorIndex m_; TensorIndex k_; TensorIndex n_; - float* a_; - float* b_; - float* c_; + T* a_; + T* b_; + T* c_; Device device_; }; #endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 6754e1a32..8947f4b7f 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -9,13 +9,13 @@ Eigen::ThreadPool pool(threads); \ Eigen::ThreadPoolDevice device(&pool, threads); // Simple functions -#define BM_FuncCPU(FUNC, THREADS) \ - static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ - StopBenchmarkTiming(); \ - CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncCPU(FUNC, THREADS) \ + static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); BM_FuncCPU(memcpy, 4); @@ -80,19 +80,19 @@ BM_FuncCPU(colReduction, 12); // Contractions -#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ - static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\ - StopBenchmarkTiming(); \ - if (THREADS == 1) { \ - Eigen::DefaultDevice device; \ - BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } else { \ - CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } \ - } \ +#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ + static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + if (THREADS == 1) { \ + Eigen::DefaultDevice device; \ + BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); @@ -138,7 +138,7 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ StopBenchmarkTiming(); \ CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ suite.FUNC(iters, DIM1, DIM2); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index 611e8197b..a6f594382 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -12,7 +12,7 @@ StopBenchmarkTiming(); \ Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ } \ @@ -41,7 +41,7 @@ BM_FuncGPU(colReduction); StopBenchmarkTiming(); \ Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \ + BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ } \ @@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); StopBenchmarkTiming(); \ Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters, DIM1, DIM2); \ } \ diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index b4f860c41..3b5be4426 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -58,6 +58,7 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_GPU +#include <iostream> #include <cuda_runtime.h> #if defined(__CUDACC__) #include <curand_kernel.h> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index e254c0b7b..4e87813a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -165,6 +165,18 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT typename XprType::Nested m_xpr; }; +template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) { + return impl.evalSubExprsIfNeeded(data); + } +}; @@ -195,10 +207,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; + return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index aec5f4c8e..a5aa05da4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -230,10 +230,10 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D pos_j_base_powered[1] = pos_j_base; if (line_len > 2) { const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int i = 2; i < line_len + 1; ++i) { - pos_j_base_powered[i] = pos_j_base_powered[i - 1] * - pos_j_base_powered[i - 1] / - pos_j_base_powered[i - 2] * pos_j_base_sq; + for (int j = 2; j < line_len + 1; ++j) { + pos_j_base_powered[j] = pos_j_base_powered[j - 1] * + pos_j_base_powered[j - 1] / + pos_j_base_powered[j - 2] * pos_j_base_sq; } } } @@ -468,7 +468,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D template <int Dir> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( - ComplexScalar* data, int n, int n_power_of_2) { + ComplexScalar* data, Index n, Index n_power_of_2) { // Original code: // RealScalar wtemp = std::sin(M_PI/n); // RealScalar wpi = -std::sin(2 * M_PI/n); @@ -482,9 +482,9 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D const ComplexScalar wp_one_2 = wp_one * wp_one; const ComplexScalar wp_one_3 = wp_one_2 * wp_one; const ComplexScalar wp_one_4 = wp_one_3 * wp_one; - const int n2 = n / 2; + const Index n2 = n / 2; ComplexScalar w(1.0, 0.0); - for (int i = 0; i < n2; i += 4) { + for (Index i = 0; i < n2; i += 4) { ComplexScalar temp0(data[i + n2] * w); ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); @@ -507,7 +507,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D template <int Dir> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( - ComplexScalar* data, int n, int n_power_of_2) { + ComplexScalar* data, Index n, Index n_power_of_2) { eigen_assert(isPowerOfTwo(n)); if (n > 8) { compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2ab332add..bc6021c9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -167,7 +167,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = NumDims == 5, + CoordAccess = false, RawAccess = false }; @@ -437,59 +437,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const - { - // Location of the first element of the patch. - // ColMajor - // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches - // RowMajor - // 0: number of batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: d - const Index patch2DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 1]; - - array<Index, NumDims-1> inputCoords; - Index input_col_idx = patch2DIndex / m_fastInputColsEff; - Index inputCol = input_col_idx + coords[1] * m_in_row_strides - m_rowPaddingTop; - Index inputRow = patch2DIndex - input_col_idx * m_input_cols_eff + coords[2] * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - inputCoords[0] = coords[0]; // depth - inputCoords[1] = origInputCol; - inputCoords[2] = origInputRow; - inputCoords[3] = coords[4]; // batch - } else { - inputCoords[3] = coords[4]; // depth - inputCoords[2] = origInputCol; - inputCoords[1] = origInputRow; - inputCoords[0] = coords[0]; // batch - } - // If the computed coordinates are outside the original image perimeter, return 0. - if (inputCol < 0 || inputCol >= m_input_cols_eff || inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides)) || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - if (TensorEvaluator<ArgType, Device>::CoordAccess) { - return m_impl.coeff(inputCoords); - } else { - Index inputIndex; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - inputIndex = - inputCoords[3] * m_patchInputStride + - inputCoords[2] * m_colInputStride + - inputCoords[1] * m_rowInputStride + - inputCoords[0]; - } else { - inputIndex = - inputCoords[1] * m_patchInputStride + - inputCoords[2] * m_colInputStride + - inputCoords[3] * m_rowInputStride + - inputCoords[4]; - } - return m_impl.coeff(inputIndex); - } - } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 11284315c..e867e450e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -318,7 +318,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + CoordAccess = false, RawAccess = false }; @@ -457,15 +457,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) - { - array<Index, NumDims> inputCoords; - for (int i = 0; i < NumDims; ++i) { - inputCoords = coords[i] + this->m_offsets[i]; - } - return m_impl.coeff(inputCoords); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); if (result) { @@ -547,7 +538,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + CoordAccess = false, RawAccess = false }; @@ -608,15 +599,6 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> } } } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords) - { - array<Index, NumDims> inputCoords; - for (int i = 0; i < NumDims; ++i) { - inputCoords = coords[i] + this->m_offsets[i]; - } - return this->m_impl.coeffRef(inputCoords); - } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 39a305a93..c3f25f0df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -151,27 +151,27 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex += (index - m_padding[0].first); } else { for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i+1]; } if (index < m_padding[NumDims-1].first || index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex += (index - m_padding[NumDims-1].first); } @@ -194,14 +194,14 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device { const Index idx = coords[0]; if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex = idx - m_padding[0].first; } for (int i = 1; i < NumDims; ++i) { const Index idx = coords[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; } @@ -209,14 +209,14 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device { const Index idx = coords[NumDims-1]; if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex = idx - m_padding[NumDims-1].first; } for (int i = NumDims - 2; i >= 0; --i) { const Index idx = coords[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return Scalar(0); + return internal::scalar_cast_op<int, Scalar>()(0); } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; } @@ -245,11 +245,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. @@ -271,11 +271,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. @@ -304,11 +304,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. @@ -330,11 +330,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(Scalar(0)); + return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0)); } else if (first >= lastPaddedLeft && last < firstPaddedRight) { // all the coefficient are between the 2 padding zones. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 2cbb820b1..57b716fd6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = true, + CoordAccess = false, RawAccess = false }; @@ -248,56 +248,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const - { - Index patch_coord_idx = Layout == ColMajor ? NumDims - 1 : 0; - // Location of the first element of the patch. - const Index patchIndex = coords[patch_coord_idx]; - - if (TensorEvaluator<ArgType, Device>::CoordAccess) { - array<Index, NumDims-1> inputCoords; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i]; - inputCoords[i] = coords[i] + patchIdx; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i+1]; - inputCoords[i] = coords[i+1] + patchIdx; - } - } - Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1; - inputCoords[0] = (patchIndex + coords[coords_idx]); - return m_impl.coeff(inputCoords); - } - else { - Index inputIndex = 0; - if (Layout == ColMajor) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = coords[i+1]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } - Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1; - inputIndex += (patchIndex + coords[coords_idx]); - return m_impl.coeff(inputIndex); - } - } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 52b78b261..04f4f8ffc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -180,7 +180,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = NumDims == 6, + CoordAccess = false, RawAccess = false }; @@ -518,79 +518,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const - { - // ColMajor - // 0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches - // RowMajor - // 0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth - const Index patch3DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 4 : 1]; - const Index colOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 2]; - const Index rowOffset= coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 3]; - const Index planeOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 4]; - - array<Index, NumDims-1> inputCoords; - - const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; - const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); - if (inputCol < 0 || inputCol >= m_input_cols_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; - const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); - if (inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows; - const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; - const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); - if (inputPlane < 0 || inputPlane >= m_input_planes_eff || - ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { - return Scalar(m_paddingValue); - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - inputCoords[0] = coords[0]; // depth - inputCoords[1] = origInputPlane; - inputCoords[2] = origInputRow; - inputCoords[3] = origInputCol; - inputCoords[4] = coords[5]; // batch - } else { - inputCoords[4] = coords[5]; // depth - inputCoords[3] = origInputPlane; - inputCoords[2] = origInputRow; - inputCoords[1] = origInputCol; - inputCoords[0] = coords[0]; // batch - } - if (TensorEvaluator<ArgType, Device>::CoordAccess) { - return m_impl.coeff(inputCoords); - } else { - Index inputIndex; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - inputIndex = - inputCoords[4] * m_otherInputStride + - inputCoords[3] * m_colInputStride + - inputCoords[2] * m_rowInputStride + - inputCoords[1] * m_planeInputStride + - inputCoords[0]; - } else { - inputIndex = - inputCoords[0] * m_otherInputStride + - inputCoords[1] * m_colInputStride + - inputCoords[2] * m_rowInputStride + - inputCoords[3] * m_planeInputStride + - inputCoords[4]; - } - return m_impl.coeff(inputIndex); - } - } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 7449d6f8c..9b9fd843c 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -55,6 +55,44 @@ void test_cuda_conversion() { gpu_device.deallocate(d_conv); } + +void test_cuda_unary() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random(); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half.device(gpu_device) = gpu_float.cast<half>().abs().cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking unary " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + void test_cuda_elementwise() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -201,7 +239,10 @@ void test_cxx11_tensor_of_float16_cuda() Eigen::GpuDevice device(&stream); if (device.majorDeviceVersion() > 5 || (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { + std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl; + CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_unary()); CALL_SUBTEST_1(test_cuda_elementwise()); // CALL_SUBTEST_2(test_cuda_contractions()); CALL_SUBTEST_3(test_cuda_reductions()); |