// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2016 Benoit Steiner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU #include "main.h" #include using Eigen::Tensor; #ifdef EIGEN_HAS_CUDA_FP16 void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap, Eigen::Aligned> gpu_float( d_float, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_half( d_half, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_conv( d_conv, num_elem); gpu_float.device(gpu_device) = gpu_float.random(); gpu_half.device(gpu_device) = gpu_float.cast(); gpu_conv.device(gpu_device) = gpu_half.cast(); Tensor initial(num_elem); Tensor final(num_elem); gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); for (int i = 0; i < num_elem; ++i) { VERIFY_IS_APPROX(initial(i), final(i)); } gpu_device.deallocate(d_float); gpu_device.deallocate(d_half); gpu_device.deallocate(d_conv); } void test_cuda_unary() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap, Eigen::Aligned> gpu_float( d_float, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, num_elem); gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); gpu_res_half.device(gpu_device) = gpu_float.cast().abs().cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { std::cout << "Checking unary " << i << std::endl; VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } void test_cuda_elementwise() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap, Eigen::Aligned> gpu_float1( d_float1, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_float2( d_float2, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { std::cout << "Checking elemwise " << i << std::endl; VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } void test_cuda_contractions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int rows = 23; int cols = 23; int num_elem = rows*cols; float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap, Eigen::Aligned> gpu_float1( d_float1, rows, cols); Eigen::TensorMap, Eigen::Aligned> gpu_float2( d_float2, rows, cols); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, rows, cols); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, rows, cols); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f); typedef Tensor::DimensionPair DimPair; Eigen::array dims(DimPair(1, 0)); gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); gpu_res_half.device(gpu_device) = gpu_float1.cast().contract(gpu_float2.cast(), dims).cast(); Tensor half_prec(rows, cols); Tensor full_prec(rows, cols); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); gpu_device.synchronize(); for (int i = 0; i < rows; ++i) { for (int j = 0; j < cols; ++j) { std::cout << "Checking contract " << i << " " << j << std::endl; VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); } } gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } void test_cuda_reductions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int size = 13; int num_elem = size*size; float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float)); float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float)); Eigen::TensorMap, Eigen::Aligned> gpu_float1( d_float1, size, size); Eigen::TensorMap, Eigen::Aligned> gpu_float2( d_float2, size, size); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, size); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, size); gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); Eigen::array redux_dim = {{0}}; gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim); gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim).cast(); Tensor half_prec(size); Tensor full_prec(size); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float)); gpu_device.synchronize(); for (int i = 0; i < size; ++i) { std::cout << "Checking redux " << i << std::endl; VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } #endif void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 Eigen::CudaStreamDevice stream; Eigen::GpuDevice device(&stream); if (device.majorDeviceVersion() > 5 || (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl; CALL_SUBTEST_1(test_cuda_conversion()); CALL_SUBTEST_1(test_cuda_unary()); CALL_SUBTEST_1(test_cuda_elementwise()); CALL_SUBTEST_2(test_cuda_contractions()); CALL_SUBTEST_3(test_cuda_reductions()); } else { std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; } #else std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif }