Merge.

author: Rasmus Munk Larsen <rmlarsen@google.com> 2016-04-09 15:31:53 -0700
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2016-04-09 15:31:53 -0700
commit: 1f70bd4134216678e850374222215ae2f9949bde (patch)
tree: ce68bc67548863e8098c5ebc7d11a895bc1ce555
parent: 096e355f8e1b12d3c1f50a8f69dfd7b01def54c5 (diff)
parent: be80fb49fc63c505fcecbe53d925714701f42b2b (diff)
10 files changed, 92 insertions, 69 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 6ff61c18a..001c2ffbf 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -62,7 +62,7 @@ struct default_packet_traits
     HasRsqrt  = 0,
     HasExp    = 0,
     HasLog    = 0,
-    HasLog10    = 0,
+    HasLog10  = 0,
     HasPow    = 0,
 
     HasSin    = 0,
@@ -71,9 +71,9 @@ struct default_packet_traits
     HasASin   = 0,
     HasACos   = 0,
     HasATan   = 0,
-    HasSinh    = 0,
-    HasCosh    = 0,
-    HasTanh    = 0,
+    HasSinh   = 0,
+    HasCosh   = 0,
+    HasTanh   = 0,
     HasLGamma = 0,
     HasDiGamma = 0,
     HasZeta = 0,
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index fd73f543b..dd19f080b 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -705,12 +705,12 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>:
 isfinite_impl(const T& x)
 {
   #ifdef __CUDA_ARCH__
-    return (isfinite)(x);
+    return (::isfinite)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isfinite;
     return isfinite EIGEN_NOT_A_MACRO (x);
   #else
-    return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+    return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest();
   #endif
 }
 
@@ -720,7 +720,7 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>:
 isinf_impl(const T& x)
 {
   #ifdef __CUDA_ARCH__
-    return (isinf)(x);
+    return (::isinf)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isinf;
     return isinf EIGEN_NOT_A_MACRO (x);
@@ -735,7 +735,7 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>:
 isnan_impl(const T& x)
 {
   #ifdef __CUDA_ARCH__
-    return (isnan)(x);
+    return (::isnan)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isnan;
     return isnan EIGEN_NOT_A_MACRO (x);
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 0a3b301bf..3be7e88d7 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -406,6 +406,9 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::ha
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) {
   return Eigen::half(::sqrtf(float(a)));
 }
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));  // a^b evaluated in float via powf, then converted back to half
+}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
   return Eigen::half(::floorf(float(a)));
 }
@@ -432,6 +435,9 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half&
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
   return Eigen::half(::sqrtf(float(a)));
 }
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));  // a^b evaluated in float via powf, then converted back to half
+}
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
   return Eigen::half(::floorf(float(a)));
 }
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 14f0c9415..61d532e4d 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -17,7 +17,8 @@
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+// Most of the following operations require arch >= 5.3
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 
 namespace Eigen {
 namespace internal {
@@ -33,14 +34,7 @@ template<> struct packet_traits<half> : default_packet_traits
     AlignedOnScalar = 1,
     size=2,
     HasHalfPacket = 0,
-
-    HasDiv  = 1,
-    HasLog  = 1,
-    HasExp  = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-
-    HasBlend = 0,
+    HasDiv  = 1
   };
 };
 
@@ -74,20 +68,12 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, co
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
-#if __CUDA_ARCH__ >= 320
   return __ldg((const half2*)from);
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
 }
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
-#if __CUDA_ARCH__ >= 320
   return __halves2half2(__ldg(from+0), __ldg(from+1));
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
 }
 
 template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
@@ -120,8 +106,6 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
 
-// The following operations require arch >= 5.3
-#if  __CUDA_ARCH__ >= 530
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 }
@@ -197,7 +181,6 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
 template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
   return __hmul(__low2half(a), __high2half(a));
 }
-#endif
 
 } // end namespace internal
 
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index b2a9724de..396b38eaf 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -71,6 +71,7 @@ struct functor_traits<scalar_cast_op<half, float> >
 
 
 
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 
 template <>
 struct type_casting_traits<half, float> {
@@ -82,22 +83,9 @@ struct type_casting_traits<half, float> {
 };
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   float2 r1 = __half22float2(a);
   float2 r2 = __half22float2(b);
   return make_float4(r1.x, r1.y, r2.x, r2.y);
-#else
-  half r1;
-  r1.x = a.x & 0xFFFF;
-  half r2;
-  r2.x = (a.x & 0xFFFF0000) >> 16;
-  half r3;
-  r3.x = b.x & 0xFFFF;
-  half r4;
-  r4.x = (b.x & 0xFFFF0000) >> 16;
-  return make_float4(static_cast<float>(r1), static_cast<float>(r2),
-                     static_cast<float>(r3), static_cast<float>(r4));
-#endif
 }
 
 template <>
@@ -111,20 +99,11 @@ struct type_casting_traits<float, half> {
 
 template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
   // Simply discard the second half of the input
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __float22half2_rn(make_float2(a.x, a.y));
-#else
-  half r1 = static_cast<half>(a.x);
-  half r2 = static_cast<half>(a.y);
-  half2 r;
-  r.x = 0;
-  r.x |= r1.x;
-  r.x |= (static_cast<unsigned int>(r2.x) << 16);
-  return r;
-#endif
 }
 
 #endif
+#endif
 
 } // end namespace internal
 
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index d5e3972b5..6f3661921 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -19,10 +19,25 @@ macro(ei_add_test_internal testname testname_with_suffix)
   endif()
   
   if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
-    if (${ARGC} GREATER 2)
-      cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+    if(EIGEN_TEST_CUDA_CLANG)
+      set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
+      if(CUDA_64_BIT_DEVICE_CODE)
+        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+      else()
+        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib")
+      endif()
+      # add_executable() has no OPTIONS keyword (unlike cuda_add_executable); attach extra flags to the target.
+      add_executable(${targetname} ${filename})
+      if (${ARGC} GREATER 2)
+        set_property(TARGET ${targetname} APPEND_STRING PROPERTY COMPILE_FLAGS " ${ARGV2}")
+      endif()
+      target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread")
     else()
-      cuda_add_executable(${targetname} ${filename})
+      if (${ARGC} GREATER 2)
+        cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+      else()
+        cuda_add_executable(${targetname} ${filename})
+      endif()
     endif()
   else()
     add_executable(${targetname} ${filename})
@@ -316,7 +331,11 @@ macro(ei_testing_print_summary)
     endif()
 
     if(EIGEN_TEST_CUDA)
-      message(STATUS "CUDA:              ON")
+      if(EIGEN_TEST_CUDA_CLANG)
+        message(STATUS "CUDA:              ON (using clang)")
+      else()
+        message(STATUS "CUDA:              ON (using nvcc)")
+      endif()
     else()
       message(STATUS "CUDA:              OFF")
     endif()
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 841c4572b..7bed6a45c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -327,8 +327,14 @@ endif()
 
 # CUDA unit tests
 option(EIGEN_TEST_CUDA "Enable CUDA support in unit tests" OFF)
+option(EIGEN_TEST_CUDA_CLANG "Use clang instead of nvcc to compile the CUDA tests" OFF)
+
+if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang")
+  message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.")
+endif()
+
 if(EIGEN_TEST_CUDA)
-  
+
 find_package(CUDA 5.0)
 if(CUDA_FOUND)
   
@@ -336,6 +342,9 @@ if(CUDA_FOUND)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 
     set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
   endif()
+  if(EIGEN_TEST_CUDA_CLANG)
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
+  endif()
   cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
   
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index c6a92fe73..c088df1c1 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -190,6 +190,10 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
   endif()
+  if(EIGEN_TEST_CUDA_CLANG)
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+  endif()
+
   set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
@@ -206,10 +210,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     ei_add_test(cxx11_tensor_random_cuda)
   endif()
 
-  # Operations other that casting of half floats are only supported starting with arch 5.3
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 52)
-    ei_add_test(cxx11_tensor_of_float16_cuda)
-  endif()
+  ei_add_test(cxx11_tensor_of_float16_cuda)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 724ea7b5b..6a5ed057f 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -14,6 +14,9 @@
 
 using std::sqrt;
 
+// tolerance for checking number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
 int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
 {
     /*      subroutine fcn for chkder example. */
@@ -1023,7 +1026,8 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.njev, 72);
   // check norm^2
   std::cout.precision(30);
-  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4290986055242372e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  std::cout << lm.fvec.squaredNorm() << "\n";
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1044,7 +1048,7 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev, 9);
   VERIFY_IS_EQUAL(lm.njev, 8);
   // check norm^2
-  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.430571737783119393e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1354,8 +1358,12 @@ void testNistMGH17(void)
   
   // check return value
   VERIFY_IS_EQUAL(info, 2); 
-  VERIFY(lm.nfev < 650);  // 602
-  VERIFY(lm.njev < 600);  // 545
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev, 602);  // 602
+  VERIFY_IS_EQUAL(lm.njev, 545);  // 545
+  --g_test_level;
+  VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
 
   /*
    * Second try
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index a2bdb99e4..6dc17bd17 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -23,6 +23,9 @@
 
 using std::sqrt;
 
+// tolerance for checking number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
 struct lmder_functor : DenseFunctor<double>
 {
     lmder_functor(void): DenseFunctor<double>(3,15) {}
@@ -631,7 +634,7 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev(), 79);
   VERIFY_IS_EQUAL(lm.njev(), 72);
   // check norm^2
-//   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.430899764097e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -652,7 +655,7 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev(), 9);
   VERIFY_IS_EQUAL(lm.njev(), 8);
   // check norm^2
-//   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.428595533845e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -789,7 +792,8 @@ void testNistMGH10(void)
   MGH10_functor functor;
   LevenbergMarquardt<MGH10_functor> lm(functor);
   info = lm.minimize(x);
-  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  // was: VERIFY_IS_EQUAL(info, 1);
 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -799,9 +803,13 @@ void testNistMGH10(void)
   VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
   
   // check return value
-  //VERIFY_IS_EQUAL(info, 1);
+
+  ++g_test_level;
   VERIFY_IS_EQUAL(lm.nfev(), 284 );
   VERIFY_IS_EQUAL(lm.njev(), 249 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL);
 
   /*
    * Second try
@@ -809,7 +817,10 @@ void testNistMGH10(void)
   x<< 0.02, 4000., 250.;
   // do the computation
   info = lm.minimize(x);
+  ++g_test_level;
   VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  // was: VERIFY_IS_EQUAL(info, 1);
+  --g_test_level;
 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -819,9 +830,12 @@ void testNistMGH10(void)
   VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
   
   // check return value
-  //VERIFY_IS_EQUAL(info, 1);
+  ++g_test_level;
   VERIFY_IS_EQUAL(lm.nfev(), 126);
   VERIFY_IS_EQUAL(lm.njev(), 116);
+  --g_test_level;
+  VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL);
 }
 
 
@@ -896,8 +910,12 @@ void testNistBoxBOD(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1); 
+  ++g_test_level;
   VERIFY_IS_EQUAL(lm.nfev(), 16 );
   VERIFY_IS_EQUAL(lm.njev(), 15 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
   // check x
author	Rasmus Munk Larsen <rmlarsen@google.com>	2016-04-09 15:31:53 -0700
committer	Rasmus Munk Larsen <rmlarsen@google.com>	2016-04-09 15:31:53 -0700
commit	1f70bd4134216678e850374222215ae2f9949bde (patch)
tree	ce68bc67548863e8098c5ebc7d11a895bc1ce555
parent	096e355f8e1b12d3c1f50a8f69dfd7b01def54c5 (diff)
parent	be80fb49fc63c505fcecbe53d925714701f42b2b (diff)