author      Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-11-06 14:54:28 -0800
committer   Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-11-06 14:54:28 -0800
commit      d573efe303b51317d57754f52a22debfa095dc9f (patch)
tree        3f136741afcee6d5700410df4c6d8c680743e357 /unsupported
parent      9fa283339f64224643089740d89c9f35aa3ab2b3 (diff)
Code cleanup
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                              |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h        | 181
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h    | 140
3 files changed, 141 insertions, 181 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 282ea00bb..1e3d2c06a 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -74,6 +74,7 @@
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
+#include "src/Tensor/TensorReductionCuda.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContraction.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 4233f7341..d4e88fabd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -336,187 +336,6 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
};
#endif
-
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-// Full reducers for GPU; don't vectorize for now.
-
-// Reducer function that enables multiple cuda threads to safely accumulate at the same
-// output address. It basically reads the current value of the output variable and
-// attempts to update it with the new value. If another cuda thread has updated the
-// content of the output address in the meantime, it tries again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if __CUDA_ARCH__ >= 300
- if (sizeof(T) == 4)
- {
- unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
- unsigned int newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned int readback;
- while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else if (sizeof(T) == 8) {
- unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
- unsigned long long newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned long long readback;
- while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else {
- assert(0 && "Wordsize not supported");
- }
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-template <typename T>
-__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
-#if __CUDA_ARCH__ >= 300
- atomicAdd(output, accum);
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
- typename Self::CoeffReturnType* output) {
- const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
- if (first_index == 0) {
- *output = reducer.initialize();
- }
-
- typename Self::CoeffReturnType accum = reducer.initialize();
- for (Index i = 0; i < NumPerThread; ++i) {
- const Index index = first_index + i * BlockSize;
- if (index >= num_coeffs) {
- break;
- }
- typename Self::CoeffReturnType val = input.m_impl.coeff(index);
- reducer.reduce(val, &accum);
- }
-
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- reducer.reduce(__shfl_down(accum, offset), &accum);
- }
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- atomicReduce(output, accum, reducer);
- }
-}
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-  // Unfortunately nvidia doesn't have good support for exotic types such as complex,
-  // so restrict the scope of the optimized version of the code to the simple case
-  // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
-
- template <typename OutputType>
- static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
- assert(false && "Should only be called on floats");
- }
-
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
- typedef typename Self::Index Index;
-
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- const int block_size = 256;
- const int num_per_thread = 128;
- const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
- LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
- }
-};
-
-#endif
-
-
-template <typename Self, typename Op,
- bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
-class BlockReducer {
- public:
- typedef typename Self::Index Index;
- typedef typename Self::Scalar Scalar;
- typedef typename Self::CoeffReturnType CoeffReturnType;
- explicit BlockReducer(const Op& reducer) : op_(reducer) {
- accum_ = op_.initialize();
- }
- void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
- for (Index i = 0; i < num_values_to_reduce; ++i) {
- op_.reduce(data[index + i], &accum_);
- }
- }
- CoeffReturnType Finalize() {
- return op_.finalize(accum_);
- }
-
- private:
- CoeffReturnType accum_;
- Op op_;
-};
-
-
-template <typename Self, typename Op>
-class BlockReducer<Self, Op, true> {
- public:
- typedef typename Self::Index Index;
- typedef typename Self::Scalar Scalar;
- typedef typename Self::CoeffReturnType CoeffReturnType;
- typedef typename Self::PacketReturnType PacketReturnType;
- explicit BlockReducer(const Op& reducer) : op_(reducer) {
- vaccum_ = op_.template initializePacket<PacketReturnType>();
- accum_ = op_.initialize();
- }
- void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
- const int packet_size = internal::unpacket_traits<PacketReturnType>::size;
- const typename Self::Index vectorized_size = (num_values_to_reduce /
- packet_size) * packet_size;
- for (typename Self::Index i = 0; i < vectorized_size; i += packet_size) {
- op_.reducePacket(internal::ploadt<PacketReturnType, Unaligned>(
- &data[index + i]), &vaccum_);
- }
-
- for (typename Self::Index i = vectorized_size;
- i < num_values_to_reduce; ++i) {
- op_.reduce(data[index + i], &accum_);
- }
- }
- typename Self::CoeffReturnType Finalize() {
- return op_.finalizeBoth(accum_, vaccum_);
- }
-
- private:
- typename Self::PacketReturnType vaccum_;
- typename Self::CoeffReturnType accum_;
- Op op_;
-};
-
} // end namespace internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
new file mode 100644
index 000000000..49102fca2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+// Full reducers for GPU; don't vectorize for now.
+
+// Reducer function that enables multiple cuda threads to safely accumulate at the same
+// output address. It basically reads the current value of the output variable and
+// attempts to update it with the new value. If another cuda thread has updated the
+// content of the output address in the meantime, it tries again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if __CUDA_ARCH__ >= 300
+ if (sizeof(T) == 4)
+ {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else if (sizeof(T) == 8) {
+ unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+ unsigned long long newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned long long readback;
+ while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else {
+ assert(0 && "Wordsize not supported");
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+template <typename T>
+__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
+#if __CUDA_ARCH__ >= 300
+ atomicAdd(output, accum);
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+ typename Self::CoeffReturnType* output) {
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ for (Index i = 0; i < NumPerThread; ++i) {
+ const Index index = first_index + i * BlockSize;
+ if (index >= num_coeffs) {
+ break;
+ }
+ typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+ reducer.reduce(val, &accum);
+ }
+
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reduce(__shfl_down(accum, offset), &accum);
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(output, accum, reducer);
+ }
+}
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't have good support for exotic types such as complex,
+  // so restrict the scope of the optimized version of the code to the simple case
+  // of floats.
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ internal::is_same<typename Self::CoeffReturnType, float>::value;
+
+ template <typename OutputType>
+ EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ assert(false && "Should only be called on floats");
+ }
+
+ EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+ typedef typename Self::Index Index;
+
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+ LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+ }
+};
+
+#endif
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
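
The compare-and-swap loop in atomicReduce() above is generic over the reducer, which makes its control flow a little hard to follow. The standalone CUDA sketch below shows the same read / locally-reduce / atomicCAS / retry pattern specialized to an atomic max on a 32-bit float. It is illustration only, not part of the patch; atomicMaxFloat and maxKernel are hypothetical names introduced here.

// Hypothetical, self-contained illustration of the atomicCAS retry loop used by
// atomicReduce(): an atomic max on a float stored in global memory.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__device__ void atomicMaxFloat(float* output, float value) {
  unsigned int* address = reinterpret_cast<unsigned int*>(output);
  unsigned int observed = *address;                      // read the current value
  while (true) {
    const float current = __uint_as_float(observed);
    const float desired = fmaxf(current, value);         // "reduce" into a local copy
    if (__float_as_uint(desired) == observed) {
      return;                                            // nothing new to publish
    }
    const unsigned int assumed = observed;
    observed = atomicCAS(address, assumed, __float_as_uint(desired));
    if (observed == assumed) {
      return;                                            // our CAS won the race
    }
    // Another thread updated *output in the meantime: loop and retry with the
    // value we read back, exactly like the readback loop in atomicReduce().
  }
}

__global__ void maxKernel(const float* data, int n, float* result) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    atomicMaxFloat(result, data[i]);
  }
}

int main() {
  const int n = 1024;
  std::vector<float> host(n);
  for (int i = 0; i < n; ++i) host[i] = static_cast<float>(i % 97);

  float *d_data = nullptr, *d_result = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMalloc(&d_result, sizeof(float));
  cudaMemcpy(d_data, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  const float init = -1e30f;
  cudaMemcpy(d_result, &init, sizeof(float), cudaMemcpyHostToDevice);

  maxKernel<<<(n + 255) / 256, 256>>>(d_data, n, d_result);

  float result = 0.f;
  cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("max = %f\n", result);   // expect 96
  cudaFree(d_data);
  cudaFree(d_result);
  return 0;
}

The per-warp __shfl_down reduction in FullReductionKernel serves the same purpose one level lower: it collapses the 32 partial accumulators of a warp into one before a single atomicReduce call per warp touches global memory.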
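
One more detail worth spelling out: FullReducer<Self, Op, GpuDevice, Vectorizable> pairs a templated run() that only asserts with a non-template run() taking float*. Because C++ overload resolution prefers a non-template function over a function template when both are exact matches, float outputs get the optimized kernel while any other coefficient type still compiles but fails loudly if it is ever called. The host-only snippet below, using a hypothetical Dispatcher type rather than Eigen code, sketches that mechanism.

#include <cassert>
#include <cstdio>

// Hypothetical stand-in for FullReducer<..., GpuDevice, ...>: only float has a
// real implementation; every other output type falls through to the template.
struct Dispatcher {
  template <typename OutputType>
  static void run(OutputType*) {
    // Mirrors the templated run() in the patch: reachable only for types the
    // GPU reduction does not optimize.
    assert(false && "Should only be called on floats");
  }
  static void run(float*) {
    std::printf("optimized float path taken\n");
  }
};

int main() {
  float f = 0.0f;
  Dispatcher::run(&f);    // exact match on the non-template overload wins
  // double d = 0.0;
  // Dispatcher::run(&d); // would pick the template and hit the assert at runtime
  return 0;
}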