diff options
5 files changed, 38 insertions, 38 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8a04f7d34..e60fab713 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -934,8 +934,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // Sizes of the blocks to load in cache. See the Goto paper for details. BlockingType blocking(m, n, k, 1, true); const Index kc = blocking.kc(); - const Index mc = (std::min)(m, blocking.mc()); - const Index nc = (std::min)(n, blocking.nc()); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); const Index sizeA = mc * kc; const Index sizeB = kc * nc; @@ -944,16 +944,16 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT for(Index i2=0; i2<m; i2+=mc) { - const Index actual_mc = (std::min)(i2+mc,m)-i2; + const Index actual_mc = numext::mini(i2+mc,m)-i2; for (Index k2 = 0; k2 < k; k2 += kc) { // make sure we don't overshoot right edge of left matrix, then pack vertical panel - const Index actual_kc = (std::min)(k2 + kc, k) - k2; + const Index actual_kc = numext::mini(k2 + kc, k) - k2; pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); // series of horizontal blocks for (Index j2 = 0; j2 < n; j2 += nc) { // make sure we don't overshoot right edge of right matrix, then pack block - const Index actual_nc = (std::min)(j2 + nc, n) - j2; + const Index actual_nc = numext::mini(j2 + nc, n) - j2; pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); // call gebp (matrix kernel) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 57030229d..576bea295 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // note: You can get away with allocating just a single blockA and offsets and meet the // the alignment requirements with the assumption that // (Traits::mr * sizeof(ResScalar)) % 16 == 0 - const Index numBlockAs = (std::min)(num_threads, m_blocks); + const Index numBlockAs = numext::mini(num_threads, m_blocks); std::vector<LhsScalar *> blockAs; blockAs.reserve(num_threads); for (int i = 0; i < num_threads; i++) { @@ -230,14 +230,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { const Index k_start = k_block_idx * kc; // make sure we don't overshoot right edge of left matrix - const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + const Index actual_kc = numext::mini(k_start + kc, k) - k_start; for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { - const Index num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs); + const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { const Index m_start = mt_block_idx * mc; - const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + const Index actual_mc = numext::mini(m_start + mc, m) - m_start; eigen_assert(actual_mc > 0); Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; @@ -275,7 +275,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { const Index n_start = n_block_idx * nc; - const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + const Index actual_nc = numext::mini(n_start + nc, n) - n_start; // first make sure the previous kernels are all done before overwriting rhs. Also wait if // we're going to start new k. In both cases need_to_pack is true. @@ -376,7 +376,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT if (m_base_start < arg.max_m) { Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; wait_until_ready((*arg.lhs_notifications)[blockAId]); - const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start; + const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; gebp(arg.output.getSubMapper(m_base_start, arg.n), (*arg.blockAs)[blockAId], arg.blockB, actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 23500e138..a82bfc0aa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -857,29 +857,29 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr if (m_indices[0] == single_stride_dim) { // Maximum the reuse const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; - maxX = (std::min<int>)(inner_dim, numX); - const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); - block_size.x = (std::min)(maxThreadsPerBlock, maxX); - block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP); + maxX = numext::mini<int>(inner_dim, numX); + const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP); } else { // Read as much as possible alongside the inner most dimension, that is the plane const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); - const int maxP = (std::min<int>)(inner_dim, numP); - maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + const int maxP = numext::mini<int>(inner_dim, numP); + maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); - block_size.x = (std::min)(warpSize, maxX); - block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP); + block_size.x = numext::mini(warpSize, maxX); + block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP); } const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); assert(shared_mem <= maxSharedMem); const int num_x_blocks = ceil(numX, maxX); - const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); - dim3 num_blocks(num_x_blocks, std::min<int>(num_y_blocks, ceil(numP, block_size.y))); + dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y))); //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; @@ -920,24 +920,24 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr // Snap maxX to warp size int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; - const int maxX = (std::min<int>)(inner_dim, numX); - const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); - const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + const int maxX = numext::mini<int>(inner_dim, numX); + const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); dim3 block_size; - block_size.x = (std::min)(1024, maxX); - block_size.y = (std::min<int>)(1024/block_size.x, maxY); - block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP); + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini<int>(1024/block_size.x, maxY); + block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP); const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); assert(shared_mem <= maxSharedMem); const int num_x_blocks = ceil(numX, maxX); const int num_y_blocks = ceil(numY, maxY); - const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); - dim3 num_blocks(num_x_blocks, num_y_blocks, std::min<int>(num_z_blocks, ceil(numP, block_size.z))); + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z))); //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; @@ -999,14 +999,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr const int numZ = dimensions()[m_indices[idxZ]]; const int numP = dimensions().TotalSize() / (numX*numY*numZ); - const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); - const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); - const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); dim3 block_size; - block_size.x = (std::min)(32, maxX); - block_size.y = (std::min)(32, maxY); - block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ); + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ); dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 2c5e67f82..b2800aefb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -123,7 +123,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; std::vector<Notification*> results; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 9f8819720..d9061c216 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -147,7 +147,7 @@ template <typename T> struct MaxReducer } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return (std::max)(saccum, predux_max(vaccum)); + return numext::maxi(saccum, predux_max(vaccum)); } }; @@ -180,7 +180,7 @@ template <typename T> struct MinReducer } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return (std::min)(saccum, predux_min(vaccum)); + return numext::mini(saccum, predux_min(vaccum)); } }; |