8 files changed, 480 insertions, 53 deletions
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index cf962aed1..11de45c2e 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -173,6 +173,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
   : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type
 {
     typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
   public:
 
     typedef typename internal::dense_xpr_base<BlockType>::type Base;
@@ -294,10 +295,13 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
     #endif
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const // read-only view of m_xpr
     { 
       return m_xpr; 
     }
+
+    /** \returns the nested expression m_xpr through a non-const reference. */
+    EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; }
       
     EIGEN_DEVICE_FUNC
     StorageIndex startRow() const
@@ -313,9 +317,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
 
   protected:
 
-    typename XprType::Nested m_xpr;
-    const internal::variable_if_dynamic<StorageIndex, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<StorageIndex, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
     const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
     const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
 };
@@ -326,6 +330,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
   : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
 {
     typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
     enum {
       XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
     };
@@ -343,7 +348,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
                                 || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
              BlockRows==1 ? 1 : xpr.rows(),
              BlockCols==1 ? 1 : xpr.cols()),
-        m_xpr(xpr)
+        m_xpr(xpr),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)
     {
       init();
     }
@@ -353,7 +360,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
       : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
-        m_xpr(xpr)
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) // origin kept for startRow()/startCol()
     {
       init();
     }
@@ -365,16 +372,19 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
       : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
-        m_xpr(xpr)
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
       init();
     }
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const // read-only view of m_xpr
     { 
       return m_xpr; 
     }
+
+    /** \returns the nested expression m_xpr through a non-const reference. */
+    EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; }
       
     /** \sa MapBase::innerStride() */
     EIGEN_DEVICE_FUNC
@@ -392,6 +402,18 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
       return m_outerStride;
     }
 
+    /** \returns the value held in m_startRow, i.e. the starting row that
+      * was recorded when this block was constructed.
+      */
+    EIGEN_DEVICE_FUNC
+    StorageIndex startRow() const { return m_startRow.value(); }
+
+    /** \returns the value held in m_startCol, i.e. the starting column that
+      * was recorded when this block was constructed.
+      */
+    EIGEN_DEVICE_FUNC
+    StorageIndex startCol() const { return m_startCol.value(); }
+
   #ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
   // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
@@ -417,7 +439,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
                     : m_xpr.innerStride();
     }
 
-    typename XprType::Nested m_xpr;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
     Index m_outerStride;
 };
 
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index a729e0454..388805f0d 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -865,8 +865,8 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  
 protected:
   evaluator<ArgType> m_argImpl;
-  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+  const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+  const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
 };
 
 // TODO: This evaluator does not actually use the child evaluator; 
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index a24bf6e26..98d8e029f 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -267,31 +267,34 @@ pexp<Packet8f>(const Packet8f& _x) {
 
 // Hyperbolic Tangent function.
 // Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-8, 8], outside of which the
+// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
 // fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 ptanh<Packet8f>(const Packet8f& _x) {
-  // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later.
-  const Packet8f x = _mm256_mul_ps(_x, _mm256_set1_ps(0.125f));
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f);
+  const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x));
 
   // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f);
 
   // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f);
 
   // Since the polynomials are odd/even, we need x^2.
-  const Packet8f x2 = _mm256_mul_ps(x, x);
+  const Packet8f x2 = pmul(x, x);
 
   // Evaluate the numerator polynomial p.
   Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11);
@@ -308,14 +311,7 @@ ptanh<Packet8f>(const Packet8f& _x) {
   q = pmadd(x2, q, p8f_beta_0);
 
   // Divide the numerator by the denominator.
-  const Packet8f res = pdiv(p, q);
-
-  // Mask-out values outside of [-8, 8].
-  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(minus_one, -1.0f);
-  return _mm256_blendv_ps(
-      _mm256_blendv_ps(res, p8f_one, _mm256_cmp_ps(x, p8f_one, _CMP_GT_OQ)),
-      p8f_minus_one, _mm256_cmp_ps(x, p8f_minus_one, _CMP_LT_OQ));
+  return pdiv(p, q);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index d3d9f910e..a32b41e18 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -21,7 +21,6 @@ namespace internal {
 template<> struct is_arithmetic<float4>  { enum { value = true }; };
 template<> struct is_arithmetic<double2> { enum { value = true }; };
 
-
 template<> struct packet_traits<float> : default_packet_traits
 {
   typedef float4 type;
@@ -273,6 +272,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a)
   return a.x * a.y;
 }
 
+// Lane rotation for float4 packets: with offset in {0,1,2,3}, lane i of the
+// result is lane (i - offset) mod 4 of the input. Any other offset takes the
+// last branch and therefore behaves like offset == 3.
+template<size_t offset>
+struct protate_impl<offset, float4>
+{
+  // EIGEN_DEVICE_FUNC so that device code can call it like the other primitives here.
+  EIGEN_DEVICE_FUNC static inline float4 run(const float4& a) {
+    switch (offset) {
+      case 0:  return make_float4(a.x, a.y, a.z, a.w);
+      case 1:  return make_float4(a.w, a.x, a.y, a.z);
+      case 2:  return make_float4(a.z, a.w, a.x, a.y);
+      default: return make_float4(a.y, a.z, a.w, a.x);
+    }
+  }
+};
+
+// Lane rotation for double2 packets: offset 0 returns the input unchanged,
+// every other offset swaps the two lanes.
+template<size_t offset>
+struct protate_impl<offset, double2>
+{
+  // EIGEN_DEVICE_FUNC so that device code can call it like the other primitives here.
+  EIGEN_DEVICE_FUNC static inline double2 run(const double2& a) {
+    return (offset == 0) ? make_double2(a.x, a.y) : make_double2(a.y, a.x);
+  }
+};
+
+
 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
new file mode 100644
index 000000000..1a1b4ec3d
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -0,0 +1,245 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
+#define EIGEN_PACKET_MATH_HALF_CUDA_H
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+// Half-precision sum via __hadd. Declared inline because a header-level,
+// non-inline definition is repeated in every translation unit including it.
+__device__ inline half operator + (const half& a, const half& b) { return __hadd(a, b); }
+// Half-precision product via __hmul. Declared inline because a header-level,
+// non-inline definition is repeated in every translation unit including it.
+__device__ inline half operator * (const half& a, const half& b) { return __hmul(a, b); }
+// Half-precision difference via __hsub. Declared inline because a header-level,
+// non-inline definition is repeated in every translation unit including it.
+__device__ inline half operator - (const half& a, const half& b) { return __hsub(a, b); }
+// Quotient computed in single precision and converted back with __float2half.
+// Declared inline because the definition lives in a header.
+__device__ inline half operator / (const half& a, const half& b) {
+  return __float2half(__half2float(a) / __half2float(b));
+}
+// Half-precision negation via __hneg. Declared inline because the definition
+// lives in a header.
+__device__ inline half operator - (const half& a) { return __hneg(a); }
+// In-place sum; inline (header definition) and returns a reference to `a` rather than a copy.
+__device__ inline half& operator += (half& a, const half& b) {
+  return a = a + b;
+}
+// In-place product; inline (header definition) and returns a reference to `a` rather than a copy.
+__device__ inline half& operator *= (half& a, const half& b) {
+  return a = a * b;
+}
+// In-place difference; inline (header definition) and returns a reference to `a` rather than a copy.
+__device__ inline half& operator -= (half& a, const half& b) {
+  return a = a - b;
+}
+// In-place quotient; inline (header definition) and returns a reference to `a` rather than a copy.
+__device__ inline half& operator /= (half& a, const half& b) {
+  return a = a / b;
+}
+
+namespace std {
+__device__ inline half abs(const half& a) {  // inline: this definition lives in a header
+  half magnitude;
+  magnitude.x = a.x & 0x7FFF;  // keep the low 15 bits of the raw storage, clearing bit 15
+  return magnitude;
+}
+}
+
+namespace Eigen {
+namespace internal {
+
+template<> struct is_arithmetic<half> { enum { value = true }; };  // scalar half type
+template<> struct is_arithmetic<half2> { enum { value = true }; }; // packet type declared for half in packet_traits
+
+template<> struct packet_traits<half> : default_packet_traits
+{
+  typedef half2 type;  // packet type used for half scalars
+  typedef half2 half;  // half-size packet; same as `type`, consistent with HasHalfPacket = 0
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,            // coefficients per packet
+    HasHalfPacket = 0,
+    // NOTE(review): this file defines no log/exp/sqrt/rsqrt/lgamma/digamma/erf/erfc kernels for half2 - confirm these flags.
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    // Blend is explicitly switched off.
+    HasBlend = 0,
+  };
+};
+
+
+template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; // reverse mapping: half2 packet -> half scalar
+
+// Fills both lanes of the packet with the scalar `from`.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 pset1<half2>(const half& from) { return __half2half2(from); }
+
+// Builds the packet (a, a + 1): `a` in the low lane, `a` plus one in the high lane.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 plset<half2>(const half& a) { return __halves2half2(a, __hadd(a, __float2half(1))); }
+
+// Lane-wise sum of two half2 packets.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 padd<half2>(const half2& a, const half2& b) { return __hadd2(a, b); }
+
+// Lane-wise difference a - b of two half2 packets.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 psub<half2>(const half2& a, const half2& b) { return __hsub2(a, b); }
+
+// Lane-wise negation of a half2 packet.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 pnegate(const half2& a) { return __hneg2(a); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } // returns its argument unchanged
+
+// Lane-wise product of two half2 packets.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 pmul<half2>(const half2& a, const half2& b) { return __hmul2(a, b); }
+
+// Lane-wise fused multiply-add: a * b + c.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { return __hfma2(a, b, c); }
+
+// Lane-wise division. Both operands are widened to float, divided in single
+// precision, and the two quotients are rounded back into one half2.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+  const float lo_num = __low2float(a);
+  const float hi_num = __high2float(a);
+  const float lo_den = __low2float(b);
+  const float hi_den = __high2float(b);
+  return __floats2half2_rn(lo_num / lo_den, hi_num / hi_den);
+}
+
+// Lane-wise minimum. Lanes are compared as floats; whenever a < b does not
+// hold for a lane, that lane is taken from `b`.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  const bool lo_from_a = __low2float(a) < __low2float(b);
+  const bool hi_from_a = __high2float(a) < __high2float(b);
+  const half lo = lo_from_a ? __low2half(a) : __low2half(b);
+  const half hi = hi_from_a ? __high2half(a) : __high2half(b);
+  return __halves2half2(lo, hi);
+}
+
+// Lane-wise maximum. Lanes are compared as floats; whenever a > b does not
+// hold for a lane, that lane is taken from `b`.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  const bool lo_from_a = __low2float(a) > __low2float(b);
+  const bool hi_from_a = __high2float(a) > __high2float(b);
+  const half lo = lo_from_a ? __low2half(a) : __low2half(b);
+  const half hi = hi_from_a ? __high2half(a) : __high2half(b);
+  return __halves2half2(lo, hi);
+}
+
+// Aligned load: reads the two halves at `from` as one half2 object.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 pload<half2>(const half* from) { return *reinterpret_cast<const half2*>(from); }
+
+// Unaligned load: assembles the packet from two separate scalar reads.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 ploadu<half2>(const half* from) { return __halves2half2(from[0], from[1]); }
+
+// Duplicates from[0] into both lanes. Marked EIGEN_DEVICE_FUNC like the other
+// loads in this file so that it is callable from device code.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) { return __halves2half2(from[0], from[0]); }
+
+// Aligned store: writes both lanes to `to` with a single half2 assignment.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void pstore<half>(half* to, const half2& from) { *reinterpret_cast<half2*>(to) = from; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
+  *to       = __low2half(from);   // low lane  -> to[0]
+  *(to + 1) = __high2half(from);  // high lane -> to[1]
+}
+
+// Aligned variant: one __ldg on the address reinterpreted as a half2.
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+  return __ldg(reinterpret_cast<const half2*>(from));
+}
+
+// Unaligned variant: each half is fetched with its own __ldg, then the two are paired.
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
+  return __halves2half2(__ldg(from), __ldg(from + 1));
+}
+
+// Strided gather: the low lane comes from from[0], the high lane from from[stride].
+template<> EIGEN_DEVICE_FUNC inline
+half2 pgather<half, half2>(const half* from, Index stride) { return __halves2half2(from[0], from[stride]); }
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
+  to[0]      = __low2half(from);   // low lane  -> to[0]
+  to[stride] = __high2half(from);  // high lane -> to[stride]
+}
+
+// First coefficient of the packet, i.e. its low lane.
+template<> EIGEN_DEVICE_FUNC inline
+half pfirst<half2>(const half2& a) { return __low2half(a); }
+
+// Horizontal sum: low lane plus high lane.
+template<> EIGEN_DEVICE_FUNC inline
+half predux<half2>(const half2& a) { return __hadd(__low2half(a), __high2half(a)); }
+
+// Horizontal max: returns the low lane if __hgt(low, high) holds, otherwise the high lane.
+template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+  const half lo = __low2half(a), hi = __high2half(a);
+  return __hgt(lo, hi) ? lo : hi;
+}
+
+// Horizontal min: returns the low lane if __hlt(low, high) holds, otherwise the high lane.
+template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+  const half lo = __low2half(a), hi = __high2half(a);
+  return __hlt(lo, hi) ? lo : hi;
+}
+
+// Horizontal product: low lane times high lane.
+template<> EIGEN_DEVICE_FUNC inline
+half predux_mul<half2>(const half2& a) { return __hmul(__low2half(a), __high2half(a)); }
+
+template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
+  half2 magnitudes;
+  magnitudes.x = a.x & 0x7FFF7FFF;  // clears bits 15 and 31 of the packed word
+  return magnitudes;
+}
+
+
+// 2x2 transpose: afterwards packet[0] holds the low lanes and packet[1] the
+// high lanes of the two input packets.
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  const half2 row0 = kernel.packet[0];
+  const half2 row1 = kernel.packet[1];
+  kernel.packet[0] = __halves2half2(__low2half(row0), __low2half(row1));
+  kernel.packet[1] = __halves2half2(__high2half(row0), __high2half(row1));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif
+#endif
+#endif
+#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
new file mode 100644
index 000000000..2742a4e7b
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -0,0 +1,123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_CUDA_H
+#define EIGEN_TYPE_CASTING_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+
+// float -> half functor; converts only when __CUDA_ARCH__ >= 530, otherwise asserts.
+template<> struct scalar_cast_op<float, half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+    return __float2half(a);
+#else
+    assert(false && "tbd");
+    return half();
+#endif
+  }
+};
+
+// Cast priced like one float addition; PacketAccess is off for this functor.
+template<> struct functor_traits<scalar_cast_op<float, half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+// int -> half functor (via float); converts only when __CUDA_ARCH__ >= 530, otherwise asserts.
+template<> struct scalar_cast_op<int, half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+    return __float2half(static_cast<float>(a));
+#else
+    assert(false && "tbd");
+    return half();
+#endif
+  }
+};
+
+// Cast priced like one float addition; PacketAccess is off for this functor.
+template<> struct functor_traits<scalar_cast_op<int, half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+// half -> float functor; converts only when __CUDA_ARCH__ >= 530, otherwise asserts.
+template<> struct scalar_cast_op<half, float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+    return __half2float(a);
+#else
+    assert(false && "tbd");
+    return 0.0f;
+#endif
+  }
+};
+
+// Cast priced like one float addition; PacketAccess is off for this functor.
+template<> struct functor_traits<scalar_cast_op<half, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+
+
+template <>
+struct type_casting_traits<half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,  // pcast<half2, float4> takes two half2 packets
+    TgtCoeffRatio = 1   // and returns one float4
+  };
+};
+
+// Widens two half2 packets into one float4 ordered (a.low, a.high, b.low, b.high).
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  const float2 lo = __half22float2(a), hi = __half22float2(b);
+  return make_float4(lo.x, lo.y, hi.x, hi.y);
+#else
+  assert(false && "tbd");
+  return float4();
+#endif
+}
+
+template <>
+struct type_casting_traits<float, half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // pcast<float4, half2> takes a single float4
+    TgtCoeffRatio = 2   // NOTE(review): that pcast returns one half2 (from a.x, a.y) - confirm how callers use this ratio
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+  // Converts a.x and a.y only; a.z and a.w are discarded. Device-callable like pcast<half2, float4>.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __float22half2_rn(make_float2(a.x, a.y));
+#else
+  assert(false && "tbd");
+  return half2();
+#endif
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_CUDA_H
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index a7a0d906f..28f103eeb 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -518,30 +518,31 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
 
 // Hyperbolic Tangent function.
 // Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-8, 8], outside of which the
+// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
 // fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
 ptanh<Packet4f>(const Packet4f& _x) {
-  // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later.
-  const Packet4f x =
-      pmax(pset1<Packet4f>(-1.0f),
-           pmin(pset1<Packet4f>(1.0f), pmul(_x, pset1<Packet4f>(0.125f))));
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f);
+  const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x));
 
   // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+  _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
 
   // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f);
+  _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f);
 
   // Since the polynomials are odd/even, we need x^2.
   const Packet4f x2 = pmul(x, x);
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 01513a59e..5f8bf15b2 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -486,7 +486,12 @@ template<typename T> EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T*
 
 template<typename T> struct smart_copy_helper<T,true> {
   EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
-  { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+  { // An empty range is a no-op, so the pointer check and memcpy are skipped.
+    const std::ptrdiff_t nbytes = std::ptrdiff_t(end)-std::ptrdiff_t(start);
+    if(nbytes==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    memcpy(target, start, nbytes);
+  }
 };
 
 template<typename T> struct smart_copy_helper<T,false> {
@@ -504,7 +509,12 @@ template<typename T> void smart_memmove(const T* start, const T* end, T* target)
 
 template<typename T> struct smart_memmove_helper<T,true> {
   static inline void run(const T* start, const T* end, T* target)
-  { std::memmove(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+  { // An empty range is a no-op, so the pointer check and memmove are skipped.
+    const std::ptrdiff_t nbytes = std::ptrdiff_t(end)-std::ptrdiff_t(start);
+    if(nbytes==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    std::memmove(target, start, nbytes);
+  }
 };
 
 template<typename T> struct smart_memmove_helper<T,false> {