diff options
author | Yangqing Jia <me@daggerfs.com> | 2016-01-28 11:11:45 -0800 |
---|---|---|
committer | Yangqing Jia <me@daggerfs.com> | 2016-01-28 11:11:45 -0800 |
commit | 270c4e1ecd8fd10c42760dd67adbbab0b1387da2 (patch) | |
tree | 91ff1a778419cee6cc99567d01680670ec0bc26b /bench/tensors/tensor_benchmarks_gpu.cu | |
parent | c4e47630b16a716d01dc20b36afa8882b03681a1 (diff) |
bugfix
Diffstat (limited to 'bench/tensors/tensor_benchmarks_gpu.cu')
-rw-r--r-- | bench/tensors/tensor_benchmarks_gpu.cu | 67 |
1 files changed, 67 insertions, 0 deletions
// bench/tensors/tensor_benchmarks_gpu.cu
//
// Instantiates the tensor benchmarks for Eigen::GpuDevice. Every benchmark
// body follows the same sequence: stop the benchmark timer, create a CUDA
// stream device and a GpuDevice on top of it, build a BenchmarkSuite, wait
// for the device to go idle, then hand control to the requested suite method.
//
// NOTE(review): BenchmarkSuite, StopBenchmarkTiming and BENCHMARK_RANGE are
// not defined in this file; they are expected to come from
// tensor_benchmarks.h -- confirm.

// Defined ahead of every include so that the headers below can see it.
// NOTE(review): presumably this is the switch that enables Eigen's GPU
// device support -- confirm against the Eigen headers.
#define EIGEN_USE_GPU

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>

#include "tensor_benchmarks.h"

// Terminates the process with a diagnostic when a CUDA runtime call failed.
//
//   status          result code returned by the CUDA runtime call
//   benchmark_name  name of the benchmark function that issued the call,
//                   included in the message so the failure can be located
//
// Without this check a failed synchronization would be ignored and the
// benchmark would carry on against a device that is in an error state.
static void BM_CheckCudaOrDie(cudaError_t status, const char* benchmark_name) {
  if (status != cudaSuccess) {
    std::cerr << "CUDA error in " << benchmark_name << ": "
              << cudaGetErrorString(status) << std::endl;
    std::abort();
  }
}

// Simple functions
//
// BM_FuncGPU(FUNC) defines `static void BM_<FUNC>(int iters, int N)` and
// registers it through BENCHMARK_RANGE with the arguments (10, 5000).
// The suite is constructed with the single size N and FUNC is called with
// the iteration count only.
//
// NOTE(review): the timer is stopped here and never restarted in this file;
// presumably suite.FUNC restarts it around the measured work -- confirm.
#define BM_FuncGPU(FUNC)                                                \
  static void BM_##FUNC(int iters, int N) {                             \
    StopBenchmarkTiming();                                              \
    Eigen::CudaStreamDevice stream;                                     \
    Eigen::GpuDevice device(&stream);                                   \
    BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                  \
    /* Drain pending device work and surface any error it produced. */ \
    BM_CheckCudaOrDie(cudaDeviceSynchronize(), "BM_" #FUNC);            \
    suite.FUNC(iters);                                                  \
  }                                                                     \
  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);

BM_FuncGPU(memcpy);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(reduction);


// Contractions
//
// BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) defines
// `static void BM_<FUNC>_<D1>x<D2>x<D3>(int iters, int N)` and registers it
// through BENCHMARK_RANGE with the arguments (10, 5000).
//
// D1, D2 and D3 are pasted verbatim into both the function name and the
// suite constructor call. Passing the token `N` therefore refers to the
// generated function's own `N` parameter, while a literal such as 64 pins
// that dimension.
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                         \
  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {          \
    StopBenchmarkTiming();                                                \
    Eigen::CudaStreamDevice stream;                                       \
    Eigen::GpuDevice device(&stream);                                     \
    BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3);           \
    /* Drain pending device work and surface any error it produced. */   \
    BM_CheckCudaOrDie(cudaDeviceSynchronize(),                            \
                      "BM_" #FUNC "_" #D1 "x" #D2 "x" #D3);               \
    suite.FUNC(iters);                                                    \
  }                                                                       \
  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);


BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);


// Convolutions
//
// BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) defines
// `static void BM_<FUNC>_<DIM1>x<DIM2>(int iters, int N)` and registers it
// through BENCHMARK_RANGE with the arguments (128, 5000).
//
// The suite is constructed with the single size N; DIM1 and DIM2 are
// forwarded to FUNC after the iteration count.
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                        \
  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {             \
    StopBenchmarkTiming();                                                \
    Eigen::CudaStreamDevice stream;                                       \
    Eigen::GpuDevice device(&stream);                                     \
    BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                    \
    /* Drain pending device work and surface any error it produced. */   \
    BM_CheckCudaOrDie(cudaDeviceSynchronize(),                            \
                      "BM_" #FUNC "_" #DIM1 "x" #DIM2);                   \
    suite.FUNC(iters, DIM1, DIM2);                                        \
  }                                                                       \
  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);

BM_FuncWithKernelDimsGPU(convolution, 7, 1);
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);