8 files changed, 429 insertions, 15 deletions
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 2810a7a0b..06cd56684 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -271,6 +271,86 @@ pexp<Packet8f>(const Packet8f& _x) {
   return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
 }
 
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+pexp<Packet4d>(const Packet4d& _x) {
+  Packet4d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
+  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
+  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+
+  Packet4d tmp, fx;
+
+  // clamp x
+  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
+
+  // Get the integer modulus of log(2), i.e. the "n" described above.
+  fx = _mm256_floor_pd(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, p4d_cephes_exp_C1);
+  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet4d x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet4d px = p4d_cephes_exp_p0;
+  px = pmadd(px, x2, p4d_cephes_exp_p1);
+  px = pmadd(px, x2, p4d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet4d qx = p4d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
+
+  // I don't really get this bit, copied from the SSE2 routines, so...
+  // TODO(gonnet): Figure out what is going on here, perhaps find a better
+  // rational interpolant?
+  x = _mm256_div_pd(px, psub(qx, px));
+  x = pmadd(p4d_2, x, p4d_1);
+
+  // Build e=2^n by constructing the exponents in a 128-bit vector and
+  // shifting them to where they belong in double-precision values.
+  __m128i emm0 = _mm256_cvtpd_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_1023);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+  __m128i lo = _mm_slli_epi64(emm0, 52);
+  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+  e = _mm256_insertf128_si256(e, hi, 1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  return pmax(pmul(x, Packet4d(e)), _x);
+}
+
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
@@ -300,15 +380,59 @@ psqrt<Packet8f>(const Packet8f& _x) {
   return pmul(_x, x);
 }
 #else
-template <>
-EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& x) {
   return _mm256_sqrt_ps(x);
 }
 #endif
-template <>
-EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& x) {
   return _mm256_sqrt_pd(x);
 }
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+  Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
+  Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
+  Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
+                                        _mm256_and_ps(zero_mask, p8f_inf));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm256_or_ps(x, infs_and_nans);
+}
+
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+}
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+  _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
+  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+}
+
 
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 695185a49..1e751dc32 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -65,6 +65,7 @@ template<> struct packet_traits<float>  : default_packet_traits
     HasLog  = 1,
     HasExp  = 1,
     HasSqrt = 1,
+    HasRsqrt = 1,
     HasBlend = 1
   };
 };
@@ -79,8 +80,9 @@ template<> struct packet_traits<double> : default_packet_traits
     HasHalfPacket = 1,
 
     HasDiv  = 1,
-    HasExp  = 0,
+    HasExp  = 1,
     HasSqrt = 1,
+    HasRsqrt = 1,
     HasBlend = 1
   };
 };
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
new file mode 100644
index 000000000..83bfdc604
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// For now we use SSE to handle integers, so we can't use AVX instructions to cast
+// from int to float
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+  return _mm256_cvtps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+  return _mm256_cvtepi32_ps(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 19749c832..ceed1d1ef 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(cons
 }
 #endif
 
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
   return make_double2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
   to[stride*2] = from.z;
   to[stride*3] = from.w;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
 }
@@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a)
 }
 
 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
-  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
-  return make_double2(abs(a.x), abs(a.y));
+  return make_double2(fabs(a.x), fabs(a.y));
 }
 
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
@@ -279,7 +279,7 @@ ptranspose(PacketBlock<float4,4>& kernel) {
   kernel.packet[3].z = tmp;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<double2,2>& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
new file mode 100644
index 000000000..5007c155d
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
+#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
+
+namespace Eigen {
+namespace internal {
+
+/* The following lookup table was generated from measurements on a Nexus 5,
+ * which has a Qualcomm Krait 400 CPU. This is very representative of current
+ * 32bit (ARMv7) Android devices. On the other hand, I don't know how
+ * representative that is outside of these conditions. Accordingly,
+ * let's only use this lookup table on ARM 32bit on Android for now.
+ *
+ * Measurements were single-threaded, with Scalar=float, compiled with
+ * -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32.
+ *
+ * The device was cooled, allowing it to run a the max clock speed throughout.
+ * This may not be representative of real-world thermal conditions.
+ *
+ * The benchmark attempted to flush caches to test cold-cache performance.
+ */
+#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID
+template<>
+struct BlockingSizesLookupTable<float, float> {
+  static const size_t BaseSize = 16;
+  static const size_t NumSizes = 8;
+  static const unsigned short* Data() {
+    static const unsigned short data[512] = {
+      0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447,
+      0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456,
+      0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467,
+      0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478,
+      0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476,
+      0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488,
+      0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6,
+      0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5,
+      0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b,
+      0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b,
+      0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b,
+      0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576,
+      0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587,
+      0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597,
+      0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6,
+      0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6,
+      0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b,
+      0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b,
+      0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667,
+      0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675,
+      0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686,
+      0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697,
+      0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696,
+      0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696,
+      0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747,
+      0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757,
+      0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766,
+      0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777,
+      0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788,
+      0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796,
+      0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796,
+      0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796,
+      0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848,
+      0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857,
+      0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865,
+      0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877,
+      0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886,
+      0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896,
+      0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896,
+      0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896,
+      0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848,
+      0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858,
+      0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867,
+      0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877,
+      0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887,
+      0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896,
+      0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896,
+      0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896,
+      0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848,
+      0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857,
+      0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867,
+      0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
+      0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887,
+      0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897,
+      0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896,
+      0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896,
+      0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946,
+      0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856,
+      0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867,
+      0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
+      0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887,
+      0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897,
+      0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896,
+      0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896
+    };
+    return data;
+  }
+};
+#endif
+
+}
+}
+
+#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index f86c0a39a..3b8b7303f 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -462,11 +462,59 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
 
 #else
 
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
 
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+
+  Packet4f neg_half = pmul(_x, p4f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+                                        _mm_and_ps(zero_mask, p4f_inf));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm_or_ps(x, infs_and_nans);
+}
+
+#else
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation.
+  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
+  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}
 
 } // end namespace internal
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 3653783fd..38a84273d 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -108,6 +108,7 @@ template<> struct packet_traits<float>  : default_packet_traits
     HasLog  = 1,
     HasExp  = 1,
     HasSqrt = 1,
+    HasRsqrt = 1,
     HasBlend = 1
   };
 };
@@ -124,6 +125,7 @@ template<> struct packet_traits<double> : default_packet_traits
     HasDiv  = 1,
     HasExp  = 1,
     HasSqrt = 1,
+    HasRsqrt = 1,
     HasBlend = 1
   };
 };
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
new file mode 100644
index 000000000..c84893230
--- /dev/null
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return _mm_cvttps_epi32(a);
+}
+
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return _mm_cvtepi32_ps(a);
+}
+
+
+template <>
+struct type_casting_traits<double, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Simply discard the second half of the input
+  return _mm_cvtps_pd(a);
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SSE_H