-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h            | 10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h            | 48
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h               |  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h               |  4
5 files changed, 38 insertions(+), 38 deletions(-)
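For context, the change applied uniformly in the hunks below is mechanical: the parenthesized (std::min)/(std::max) calls (parenthesized to avoid clashing with Windows' min/max macros) become Eigen::numext::mini/Eigen::numext::maxi, which are declared EIGEN_DEVICE_FUNC and are therefore callable from CUDA device code as well as host code. A minimal sketch of the two forms side by side (example() is a hypothetical function, not part of the patch):

#include <algorithm>
#include <Eigen/Core>

void example(int m, int mc) {
  const int a = (std::min)(m, mc);           // plain host-only call
  const int b = Eigen::numext::mini(m, mc);  // host- and device-callable
  // Both return the smaller argument; only the callability differs.
  (void)a;
  (void)b;
}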
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 8a04f7d34..e60fab713 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -934,8 +934,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// Sizes of the blocks to load in cache. See the Goto paper for details.
BlockingType blocking(m, n, k, 1, true);
const Index kc = blocking.kc();
- const Index mc = (std::min)(m, blocking.mc());
- const Index nc = (std::min)(n, blocking.nc());
+ const Index mc = numext::mini(m, blocking.mc());
+ const Index nc = numext::mini(n, blocking.nc());
const Index sizeA = mc * kc;
const Index sizeB = kc * nc;
@@ -944,16 +944,16 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for(Index i2=0; i2<m; i2+=mc)
{
- const Index actual_mc = (std::min)(i2+mc,m)-i2;
+ const Index actual_mc = numext::mini(i2+mc,m)-i2;
for (Index k2 = 0; k2 < k; k2 += kc) {
// make sure we don't overshoot right edge of left matrix, then pack vertical panel
- const Index actual_kc = (std::min)(k2 + kc, k) - k2;
+ const Index actual_kc = numext::mini(k2 + kc, k) - k2;
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
// series of horizontal blocks
for (Index j2 = 0; j2 < n; j2 += nc) {
// make sure we don't overshoot right edge of right matrix, then pack block
- const Index actual_nc = (std::min)(j2 + nc, n) - j2;
+ const Index actual_nc = numext::mini(j2 + nc, n) - j2;
pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
// call gebp (matrix kernel)
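All of the actual_kc/actual_mc/actual_nc computations above follow the same clamping pattern for the final, possibly partial, block of each dimension. A minimal standalone sketch (clamped_block_size is a hypothetical helper, not part of the patch):

#include <Eigen/Core>

// Extent of the block that starts at `start`: the nominal block size,
// except for the last block, which is clamped to what remains.
template <typename Index>
Index clamped_block_size(Index start, Index block, Index total) {
  return Eigen::numext::mini(start + block, total) - start;
}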
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 57030229d..576bea295 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// note: You can get away with allocating just a single blockA and offsets and meet the
// alignment requirements with the assumption that
// (Traits::mr * sizeof(ResScalar)) % 16 == 0
- const Index numBlockAs = (std::min)(num_threads, m_blocks);
+ const Index numBlockAs = numext::mini(num_threads, m_blocks);
std::vector<LhsScalar *> blockAs;
blockAs.reserve(num_threads);
for (int i = 0; i < num_threads; i++) {
@@ -230,14 +230,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
const Index k_start = k_block_idx * kc;
// make sure we don't overshoot right edge of left matrix
- const Index actual_kc = (std::min)(k_start + kc, k) - k_start;
+ const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
- const Index num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs);
+ const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
const Index m_start = mt_block_idx * mc;
- const Index actual_mc = (std::min)(m_start + mc, m) - m_start;
+ const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
eigen_assert(actual_mc > 0);
Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
@@ -275,7 +275,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
const Index n_start = n_block_idx * nc;
- const Index actual_nc = (std::min)(n_start + nc, n) - n_start;
+ const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
// first make sure the previous kernels are all done before overwriting rhs. Also wait if
// we're going to start new k. In both cases need_to_pack is true.
@@ -376,7 +376,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
if (m_base_start < arg.max_m) {
Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
wait_until_ready((*arg.lhs_notifications)[blockAId]);
- const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start;
+ const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
gebp(arg.output.getSubMapper(m_base_start, arg.n),
(*arg.blockAs)[blockAId], arg.blockB,
actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
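The thread-pool path packs at most numext::mini(num_threads, m_blocks) LHS buffers per pass and hands them out round-robin; a hypothetical standalone helper mirroring the blockAId computation above (std::ptrdiff_t stands in for the evaluator's Index type):

#include <cstddef>

// Round-robin slot for the packed LHS block (k_block_idx, mt_block_idx):
// the same pool of num_threads buffers is reused across all (k, m) pairs.
inline std::ptrdiff_t lhs_buffer_slot(std::ptrdiff_t k_block_idx,
                                      std::ptrdiff_t m_blocks,
                                      std::ptrdiff_t mt_block_idx,
                                      std::ptrdiff_t num_threads) {
  return (k_block_idx * m_blocks + mt_block_idx) % num_threads;
}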
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 23500e138..a82bfc0aa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -857,29 +857,29 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
if (m_indices[0] == single_stride_dim) {
// Maximize the reuse
const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
- maxX = (std::min<int>)(inner_dim, numX);
- const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
- block_size.x = (std::min)(maxThreadsPerBlock, maxX);
- block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP);
+ maxX = numext::mini<int>(inner_dim, numX);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+ block_size.x = numext::mini(maxThreadsPerBlock, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
}
else {
// Read as much as possible along the innermost dimension, that is the plane
const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
- const int maxP = (std::min<int>)(inner_dim, numP);
- maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+ const int maxP = numext::mini<int>(inner_dim, numP);
+ maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
- block_size.x = (std::min)(warpSize, maxX);
- block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP);
+ block_size.x = numext::mini(warpSize, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
}
const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
- const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
- dim3 num_blocks(num_x_blocks, std::min<int>(num_y_blocks, ceil(numP, block_size.y)));
+ dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
//cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
@@ -920,24 +920,24 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// Snap maxX to warp size
int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
- const int maxX = (std::min<int>)(inner_dim, numX);
- const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
- const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+ const int maxX = numext::mini<int>(inner_dim, numX);
+ const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
dim3 block_size;
- block_size.x = (std::min)(1024, maxX);
- block_size.y = (std::min<int>)(1024/block_size.x, maxY);
- block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP);
+ block_size.x = numext::mini(1024, maxX);
+ block_size.y = numext::mini<int>(1024/block_size.x, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int num_y_blocks = ceil(numY, maxY);
- const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
- dim3 num_blocks(num_x_blocks, num_y_blocks, std::min<int>(num_z_blocks, ceil(numP, block_size.z)));
+ dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
//cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
@@ -999,14 +999,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const int numZ = dimensions()[m_indices[idxZ]];
const int numP = dimensions().TotalSize() / (numX*numY*numZ);
- const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
- const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
- const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+ const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+ const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+ const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
dim3 block_size;
- block_size.x = (std::min)(32, maxX);
- block_size.y = (std::min)(32, maxY);
- block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ);
+ block_size.x = numext::mini(32, maxX);
+ block_size.y = numext::mini(32, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
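All three convolution paths clamp the block dimensions the same way so that the product of the three extents stays within the 1024-threads-per-block limit. A minimal sketch of the 3D variant (make_block_size is a hypothetical name; dim3 requires the CUDA headers):

#include <cuda_runtime.h>
#include <Eigen/Core>

dim3 make_block_size(int maxX, int maxY, int maxZ) {
  dim3 block_size;
  block_size.x = Eigen::numext::mini(32, maxX);
  block_size.y = Eigen::numext::mini(32, maxY);
  // Keep x*y*z <= 1024, the per-block thread limit.
  block_size.z = Eigen::numext::mini<int>(
      1024 / static_cast<int>(block_size.x * block_size.y), maxZ);
  return block_size;
}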
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 2c5e67f82..b2800aefb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -123,7 +123,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
- const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
std::vector<Notification*> results;
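The executor rounds the per-thread block size down to a multiple of the packet size, but never below one packet, so vectorized evaluation never straddles a block boundary. A standalone sketch (round_to_packet_multiple is a hypothetical name):

#include <cmath>
#include <Eigen/Core>

template <typename Index>
Index round_to_packet_multiple(Index size, int num_threads, int packet_size) {
  // Split the work roughly evenly, then round down to a packet multiple,
  // but never below a single packet.
  Index blocksz = static_cast<Index>(
      std::ceil(static_cast<float>(size) / num_threads)) + packet_size - 1;
  return Eigen::numext::maxi<Index>(packet_size, blocksz - (blocksz % packet_size));
}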
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 9f8819720..d9061c216 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -147,7 +147,7 @@ template <typename T> struct MaxReducer
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
- return (std::max)(saccum, predux_max(vaccum));
+ return numext::maxi(saccum, predux_max(vaccum));
}
};
@@ -180,7 +180,7 @@ template <typename T> struct MinReducer
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
- return (std::min)(saccum, predux_min(vaccum));
+ return numext::mini(saccum, predux_min(vaccum));
}
};
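For MaxReducer/MinReducer the substitution matters because finalizeBoth is declared EIGEN_DEVICE_FUNC, so it must only call device-callable functions. A minimal sketch (finalize_both_max is a hypothetical free function, not the Eigen reducer itself) of the scalar/packet merge it performs:

#include <Eigen/Core>

// Reduce the packet lanes with predux_max, then merge the result with the
// scalar accumulator using a device-callable max.
template <typename T, typename Packet>
T finalize_both_max(const T saccum, const Packet& vaccum) {
  return Eigen::numext::maxi(saccum, Eigen::internal::predux_max(vaccum));
}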