aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--Eigen/src/Core/util/ConfigureVectorization.h6
-rw-r--r--Eigen/src/Core/util/Memory.h18
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h6
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h4
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h2
5 files changed, 26 insertions, 10 deletions
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index e75c7d89e..a2743624e 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -379,10 +379,12 @@
#include <cuda_fp16.h>
#endif
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_HIPCC)
#define EIGEN_VECTORIZE_GPU
#include <hip/hip_vector_types.h>
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
#define EIGEN_HAS_HIP_FP16
#include <hip/hip_fp16.h>
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 9dd2e0252..c624556c5 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -96,10 +96,16 @@ inline void throw_std_bad_alloc()
/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
* Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
*/
-inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
{
eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+ void *original = ::malloc(size+alignment);
+#else
void *original = std::malloc(size+alignment);
+#endif
+
if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
*(reinterpret_cast<void**>(aligned) - 1) = original;
@@ -107,9 +113,15 @@ inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = E
}
/** \internal Frees memory allocated with handmade_aligned_malloc */
-inline void handmade_aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
{
- if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
+ if (ptr) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::free(*(reinterpret_cast<void**>(ptr) - 1));
+#else
+ std::free(*(reinterpret_cast<void**>(ptr) - 1));
+#endif
+ }
}
/** \internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index b92753c44..6fc1e4a6e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -186,21 +186,21 @@ struct TensorContractionKernel {
/*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
GebpKernel;
- EIGEN_DONT_INLINE
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void packLhs(LhsScalar* lhsBlock,
const typename LhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex rows) {
LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
}
- EIGEN_DONT_INLINE
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void packRhs(RhsScalar* rhsBlock,
const typename RhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex cols) {
RhsPacker()(rhsBlock, data_mapper, depth, cols);
}
- EIGEN_DONT_INLINE
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void invoke(const OutputMapper& output_mapper,
const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
const StorageIndex rows, const StorageIndex depth,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 949764f3a..2c69e4fd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -218,6 +218,7 @@ struct InnerMostDimReducer<Self, Op, false, true> {
}
};
+#if !defined(EIGEN_HIPCC)
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
@@ -257,7 +258,8 @@ struct InnerMostDimReducer<Self, Op, true, true> {
}
}
};
-
+#endif
+
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 88940e6e6..375c570b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -292,7 +292,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
}
template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {
eigen_assert(threadIdx.x == 1);
half tmp = __low2half(*scratch);
reducer.reduce(__high2half(*scratch), &tmp);