diff options
-rw-r--r-- | bench/tensors/tensor_benchmarks.h | 6 |
-rw-r--r-- | bench/tensors/tensor_benchmarks_fp16_gpu.cu | 2 |
2 files changed, 4 insertions, 4 deletions
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index b208a401a..131d056b4 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -48,12 +48,12 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 2> sizes;
     sizes[0] = m_;
     sizes[1] = k_;
-    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
-    TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
+    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
+    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
-      B.device(device_) = A.template cast<int>();
+      B.device(device_) = A.template cast<T>();
     }
     // Record the number of values copied per second
     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index d841bcdac..49f75472a 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -19,7 +19,7 @@
   BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
 
 BM_FuncGPU(memcpy);
-//BM_FuncGPU(typeCasting);
+BM_FuncGPU(typeCasting);
 //BM_FuncGPU(random);
 BM_FuncGPU(slicing);
 BM_FuncGPU(rowChip);