From 47fefa235f73315bc57d685a7bc9cd8d3577349f Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 3 Sep 2019 17:20:56 -0700
Subject: Allow move-only done callback in TensorAsyncDevice

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h    |  8 ++--
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h  | 10 ++---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        | 34 +++++++++-------
 .../CXX11/src/Tensor/TensorForwardDeclarations.h   |  4 +-
 unsupported/test/cxx11_tensor_executor.cpp         | 19 ++++++---
 unsupported/test/cxx11_tensor_thread_pool.cpp      | 47 ++++++++++++----------
 6 files changed, 69 insertions(+), 53 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 095c85dc4..f2aa37256 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -1065,12 +1065,12 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
 #ifdef EIGEN_USE_THREADS
     // Select the async device on which to evaluate the expression.
-    template <typename DeviceType>
+    template <typename DeviceType, typename DoneCallback>
     typename internal::enable_if<
         internal::is_same<DeviceType, ThreadPoolDevice>::value,
-        TensorAsyncDevice<Derived, DeviceType>>::type
-    device(const DeviceType& dev, std::function<void()> done) {
-      return TensorAsyncDevice<Derived, DeviceType>(dev, derived(), std::move(done));
+        TensorAsyncDevice<Derived, DeviceType, DoneCallback>>::type
+    device(const DeviceType& dev, DoneCallback done) {
+      return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
     }
 #endif  // EIGEN_USE_THREADS
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 5122b3623..cc9c65702 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -73,21 +73,21 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
   * ThreadPoolDevice).
   *
   * Example:
-  *    std::function<void()> done = []() {};
+  *    auto done = []() { ... expression evaluation done ... };
   *    C.device(EIGEN_THREAD_POOL, std::move(done)) = A + B;
  */
 
-template <typename ExpressionType, typename DeviceType>
+template <typename ExpressionType, typename DeviceType, typename DoneCallback>
 class TensorAsyncDevice {
  public:
   TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
-                    std::function<void()> done)
+                    DoneCallback done)
       : m_device(device), m_expression(expression), m_done(std::move(done)) {}
 
   template <typename OtherDerived>
   EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
     typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
-    typedef internal::TensorAsyncExecutor<const Assign, DeviceType> Executor;
+    typedef internal::TensorAsyncExecutor<const Assign, DeviceType, DoneCallback> Executor;
 
     // WARNING: After assignment 'm_done' callback will be in undefined state.
     Assign assign(m_expression, other);
@@ -99,7 +99,7 @@ class TensorAsyncDevice {
  protected:
   const DeviceType& m_device;
   ExpressionType& m_expression;
-  std::function<void()> m_done;
+  DoneCallback m_done;
 };
 
 #endif  // EIGEN_USE_THREADS
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 10339e5e7..cf07656b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -101,8 +101,8 @@ class TensorExecutor {
  * Default async execution strategy is not implemented. Currently it's only
  * available for ThreadPoolDevice (see definition below).
  */
-template <typename Expression, typename Device, bool Vectorizable,
-          bool Tileable>
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable, bool Tileable>
 class TensorAsyncExecutor {};
 
 /**
@@ -419,15 +419,17 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
   }
 };
 
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
+template <typename Expression, typename DoneCallback, bool Vectorizable,
+          bool Tileable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, Tileable> {
  public:
   typedef typename Expression::Index StorageIndex;
   typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
 
   static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                            const ThreadPoolDevice& device,
-                                           std::function<void()> done) {
+                                           DoneCallback done) {
     TensorAsyncExecutorContext* const ctx =
         new TensorAsyncExecutorContext(expr, device, std::move(done));
 
@@ -455,7 +457,7 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable>
   struct TensorAsyncExecutorContext {
     TensorAsyncExecutorContext(const Expression& expr,
                                const ThreadPoolDevice& thread_pool,
-                               std::function<void()> done)
+                               DoneCallback done)
         : evaluator(expr, thread_pool), on_done(std::move(done)) {}
 
     ~TensorAsyncExecutorContext() {
@@ -466,12 +468,13 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable>
     Evaluator evaluator;
 
    private:
-    std::function<void()> on_done;
+    DoneCallback on_done;
   };
 };
 
-template <typename Expression, bool Vectorizable>
-class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
+template <typename Expression, typename DoneCallback, bool Vectorizable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, /*Tileable*/ true> {
  public:
   typedef typename traits<Expression>::Index StorageIndex;
   typedef typename traits<Expression>::Scalar Scalar;
@@ -485,7 +488,7 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable
 
   static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                            const ThreadPoolDevice& device,
-                                           std::function<void()> done) {
+                                           DoneCallback done) {
     TensorAsyncExecutorContext* const ctx =
         new TensorAsyncExecutorContext(expr, device, std::move(done));
 
@@ -494,9 +497,10 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable
 
     if (total_size < cache_size &&
         !ExpressionHasTensorBroadcastingOp<Expression>::value) {
-      internal::TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable,
-                                    /*Tileable*/ false>::runAsync(
-                                        expr, device, [ctx]() { delete ctx; });
+      auto delete_ctx = [ctx]() { delete ctx; };
+      internal::TensorAsyncExecutor<
+          Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable,
+          /*Tileable*/ false>::runAsync(expr, device, std::move(delete_ctx));
       return;
     }
 
@@ -532,7 +536,7 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable
   struct TensorAsyncExecutorContext {
     TensorAsyncExecutorContext(const Expression& expr,
                                const ThreadPoolDevice& thread_pool,
-                               std::function<void()> done)
+                               DoneCallback done)
         : device(thread_pool),
           evaluator(expr, thread_pool),
           on_done(std::move(done)) {}
@@ -548,7 +552,7 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable
     TilingContext tiling;
 
    private:
-    std::function<void()> on_done;
+    DoneCallback on_done;
   };
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index e823bd932..772dbbe35 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -94,7 +94,7 @@ template<typename XprType, template <class> class MakePointer_ = MakePointer> cl
 template<typename XprType> class TensorForcedEvalOp;
 
 template<typename ExpressionType, typename DeviceType> class TensorDevice;
-template<typename ExpressionType, typename DeviceType> class TensorAsyncDevice;
+template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
 template<typename Derived, typename Device> struct TensorEvaluator;
 
 struct NoOpOutputKernel;
@@ -168,7 +168,7 @@ template <typename Expression, typename Device,
           bool Tileable = IsTileable<Device, Expression>::value>
 class TensorExecutor;
 
-template <typename Expression, typename Device,
+template <typename Expression, typename Device, typename DoneCallback,
           bool Vectorizable = IsVectorizable<Device, Expression>::value,
           bool Tileable = IsTileable<Device, Expression>::value>
 class TensorAsyncExecutor;
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index f4d0401da..aa4ab0b80 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -578,11 +578,15 @@ static void test_async_execute_unary_expr(Device d)
   src.setRandom();
   const auto expr = src.square();
 
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
-  using Executor = internal::TensorAsyncExecutor<const Assign, Device,
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
                                                  Vectorizable, Tileable>;
-  Eigen::Barrier done(1);
-  Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); });
+
+  Executor::runAsync(Assign(dst, expr), d, on_done);
   done.Wait();
 
   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
@@ -610,12 +614,15 @@ static void test_async_execute_binary_expr(Device d)
 
   const auto expr = lhs + rhs;
 
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
-  using Executor = internal::TensorAsyncExecutor<const Assign, Device,
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
                                                  Vectorizable, Tileable>;
 
-  Eigen::Barrier done(1);
-  Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); });
+  Executor::runAsync(Assign(dst, expr), d, on_done);
   done.Wait();
 
   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 62973cd08..dae7b0335 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -683,34 +683,39 @@ EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
-  CALL_SUBTEST_3(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
-  CALL_SUBTEST_3(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
 
   // Test EvalShardedByInnerDimContext parallelization strategy.
-  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction<RowMajor>());
-  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
-  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
-  CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction<RowMajor>());
-  CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
-  CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
 
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST_5(test_contraction_corner_cases<ColMajor>());
-  CALL_SUBTEST_5(test_contraction_corner_cases<RowMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
 
-  CALL_SUBTEST_6(test_full_contraction<ColMajor>());
-  CALL_SUBTEST_6(test_full_contraction<RowMajor>());
+  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_8(test_full_contraction<RowMajor>());
 
-  CALL_SUBTEST_7(test_multithreaded_reductions<ColMajor>());
-  CALL_SUBTEST_7(test_multithreaded_reductions<RowMajor>());
+  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
 
-  CALL_SUBTEST_7(test_memcpy());
-  CALL_SUBTEST_7(test_multithread_random());
+  CALL_SUBTEST_10(test_memcpy());
+  CALL_SUBTEST_10(test_multithread_random());
 
   TestAllocator test_allocator;
-  CALL_SUBTEST_7(test_multithread_shuffle<ColMajor>(NULL));
-  CALL_SUBTEST_7(test_multithread_shuffle<RowMajor>(&test_allocator));
-  CALL_SUBTEST_7(test_threadpool_allocate(&test_allocator));
+  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
 }
-- 
cgit v1.2.3


From f59bed7a133322955dac03f3654def706aff3ba6 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 3 Sep 2019 19:11:36 -0700
Subject: Change typedefs from private to protected to fix MSVC compilation

---
 Eigen/src/SparseCore/SparseCwiseUnaryOp.h | 2 ++
 Eigen/src/SparseCore/SparseView.h         | 1 +
 2 files changed, 3 insertions(+)

diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index ea7973790..df6c28d2b 100644
--- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -49,6 +49,7 @@ template<typename UnaryOp, typename ArgType>
 class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::InnerIterator
     : public unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator
 {
+  protected:
     typedef typename XprType::Scalar Scalar;
     typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
@@ -99,6 +100,7 @@ template<typename ViewOp, typename ArgType>
 class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::InnerIterator
     : public unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator
 {
+  protected:
     typedef typename XprType::Scalar Scalar;
     typedef typename unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h
index 7c4aea743..92b3d1f7b 100644
--- a/Eigen/src/SparseCore/SparseView.h
+++ b/Eigen/src/SparseCore/SparseView.h
@@ -90,6 +90,7 @@ struct unary_evaluator<SparseView<ArgType>, IteratorBased>
     
     class InnerIterator : public EvalIterator
     {
+      protected:
         typedef typename XprType::Scalar Scalar;
       public:
 
-- 
cgit v1.2.3


From e38dd48a27b59b59a924b66a9486c3c2856acdf9 Mon Sep 17 00:00:00 2001
From: Srinivas Vasudevan <srvasude@google.com>
Date: Mon, 12 Aug 2019 19:26:29 -0400
Subject: PR 681: Add ndtri function, the inverse of the normal distribution
 function.

---
 Eigen/src/Core/GenericPacketMath.h                 |  21 ++
 Eigen/src/Core/GlobalFunctions.h                   |   1 +
 Eigen/src/Core/arch/AVX/PacketMath.h               |   1 +
 Eigen/src/Core/arch/AVX512/PacketMath.h            |   1 +
 .../Core/arch/Default/GenericPacketMathFunctions.h |  55 ++++
 Eigen/src/Core/arch/GPU/PacketMath.h               |   2 +
 Eigen/src/Core/arch/SSE/PacketMath.h               |   1 +
 Eigen/src/Core/arch/SYCL/InteropHeaders.h          |   1 +
 Eigen/src/Core/util/ForwardDeclarations.h          |   1 +
 Eigen/src/plugins/ArrayCwiseUnaryOps.h             |  28 +-
 test/packetmath.cpp                                |   6 +
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h    |   6 +
 unsupported/Eigen/SpecialFunctions                 |   1 +
 .../SpecialFunctions/SpecialFunctionsFunctors.h    |  25 ++
 .../src/SpecialFunctions/SpecialFunctionsHalf.h    |   3 +
 .../src/SpecialFunctions/SpecialFunctionsImpl.h    | 311 +++++++++++++++++----
 .../SpecialFunctions/SpecialFunctionsPacketMath.h  |   7 +
 .../arch/GPU/GpuSpecialFunctions.h                 |  13 +
 unsupported/test/cxx11_tensor_gpu.cu               |  64 +++++
 unsupported/test/special_functions.cpp             |  20 ++
 20 files changed, 498 insertions(+), 70 deletions(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 6ab38994f..651e3f7b3 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -83,6 +83,7 @@ struct default_packet_traits
     HasPolygamma = 0,
     HasErf = 0,
     HasErfc = 0,
+    HasNdtri = 0,
     HasI0e = 0,
     HasI1e = 0,
     HasIGamma = 0,
@@ -257,12 +258,32 @@ pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent);
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pzero(const Packet& a) { return pxor(a,a); }
 
+template<> EIGEN_DEVICE_FUNC inline float pzero<float>(const float& a) {
+  EIGEN_UNUSED_VARIABLE(a);
+  return 0.;
+}
+
+template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
+  EIGEN_UNUSED_VARIABLE(a);
+  return 0.;
+}
+
 /** \internal \returns bits of \a or \b according to the input bit mask \a mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pselect(const Packet& mask, const Packet& a, const Packet& b) {
   return por(pand(a,mask),pandnot(b,mask));
 }
 
+template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
+    const float& mask, const float& a, const float&b) {
+  return mask == 0 ? b : a;
+}
+
+template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
+    const double& mask, const double& a, const double& b) {
+  return mask == 0 ? b : a;
+}
+
 /** \internal \returns a <= b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 71377cee5..7f132bdd0 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -76,6 +76,7 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 020f6c276..e3363d006 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -72,6 +72,7 @@ template<> struct packet_traits<float>  : default_packet_traits
     HasLog1p  = 1,
     HasExpm1  = 1,
     HasExp  = 1,
+    HasNdtri = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasTanh  = EIGEN_FAST_MATH,
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index e37855693..11c8dae02 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -97,6 +97,7 @@ template<> struct packet_traits<float>  : default_packet_traits
     HasLog = 1,
     HasLog1p  = 1,
     HasExpm1  = 1,
+    HasNdtri = 1,
 #endif
     HasExp = 1,
     HasSqrt = EIGEN_FAST_MATH,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 0fc673e12..13351d5ec 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -512,5 +512,60 @@ Packet pcos_float(const Packet& x)
   return psincos_float<false>(x);
 }
 
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Packet, int N>
+struct ppolevl {
+  static EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+  }
+};
+
+template <typename Packet>
+struct ppolevl<Packet, 0> {
+  static EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_UNUSED_VARIABLE(x);
+    return pset1<Packet>(coeff[0]);
+  }
+};
+
 } // end namespace internal
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index d1df26e57..bdbaa5362 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -44,6 +44,7 @@ template<> struct packet_traits<float> : default_packet_traits
     HasPolygamma = 1,
     HasErf = 1,
     HasErfc = 1,
+    HasNdtri = 1,
     HasI0e = 1,
     HasI1e = 1,
     HasIGamma = 1,
@@ -78,6 +79,7 @@ template<> struct packet_traits<double> : default_packet_traits
     HasPolygamma = 1,
     HasErf = 1,
     HasErfc = 1,
+    HasNdtri = 1,
     HasI0e = 1,
     HasI1e = 1,
     HasIGamma = 1,
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b59e2c602..61500c06d 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -112,6 +112,7 @@ template<> struct packet_traits<float>  : default_packet_traits
     HasLog  = 1,
     HasLog1p  = 1,
     HasExpm1  = 1,
+    HasNdtri = 1,
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index ef66fc7de..d76030419 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -54,6 +54,7 @@ struct sycl_packet_traits : default_packet_traits {
     HasPolygamma = 0,
     HasErf = 0,
     HasErfc = 0,
+    HasNdtri = 0,
     HasIGamma = 0,
     HasIGammac = 0,
     HasBetaInc = 0,
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 050d15e96..18889fcf4 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -214,6 +214,7 @@ template<typename Scalar> struct scalar_lgamma_op;
 template<typename Scalar> struct scalar_digamma_op;
 template<typename Scalar> struct scalar_erf_op;
 template<typename Scalar> struct scalar_erfc_op;
+template<typename Scalar> struct scalar_ndtri_op;
 template<typename Scalar> struct scalar_igamma_op;
 template<typename Scalar> struct scalar_igammac_op;
 template<typename Scalar> struct scalar_zeta_op;
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 2f99ee0b2..4aef72d92 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -536,14 +536,12 @@ typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaRe
 typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
 typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
 typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+typedef CwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> NdtriReturnType;
 
 /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
   *
   * \specialfunctions_module
   *
-  * Example: \include Cwise_lgamma.cpp
-  * Output: \verbinclude Cwise_lgamma.out
-  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
   * type T to be supported.
@@ -579,9 +577,6 @@ digamma() const
   *
   * \specialfunctions_module
   *
-  * Example: \include Cwise_erf.cpp
-  * Output: \verbinclude Cwise_erf.out
-  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
   * type T to be supported.
@@ -600,9 +595,6 @@ erf() const
   *
   * \specialfunctions_module
   *
-  * Example: \include Cwise_erfc.cpp
-  * Output: \verbinclude Cwise_erfc.out
-  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
   * type T to be supported.
@@ -615,3 +607,21 @@ erfc() const
 {
   return ErfcReturnType(derived());
 }
+
+/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of ndtri(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>, erf()
+  */
+EIGEN_DEVICE_FUNC
+inline const NdtriReturnType
+ndtri() const
+{
+  return NdtriReturnType(derived());
+}
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 67ff6dc5b..0e1e50e21 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -590,6 +590,12 @@ template<typename Scalar,typename Packet> void packetmath_real()
     h.store(data2, internal::perfc(h.load(data1)));
     VERIFY((numext::isnan)(data2[0]));
   }
+  {
+    for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(0,1);
+    }
+    CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri);
+  }
 #endif  // EIGEN_HAS_C99_MATH
 
   for (int i=0; i<size; ++i)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index f2aa37256..bcb0daf30 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -201,6 +201,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_erfc_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
+    ndtri() const {
+      return unaryExpr(internal::scalar_ndtri_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
     sigmoid() const {
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
index 44fd99b43..a5abd407d 100644
--- a/unsupported/Eigen/SpecialFunctions
+++ b/unsupported/Eigen/SpecialFunctions
@@ -33,6 +33,7 @@ namespace Eigen {
   * - gamma_sample_der_alpha
   * - igammac
   * - digamma
+  * - ndtri
   * - polygamma
   * - zeta
   * - betainc
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
index c6fac91bb..13a72a3ee 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -283,6 +283,31 @@ struct functor_traits<scalar_erfc_op<Scalar> >
   };
 };
 
+/** \internal
+ * \brief Template functor to compute the Inverse of the normal distribution
+ * function of a scalar
+ * \sa class CwiseUnaryOp, Cwise::ndtri()
+ */
+template<typename Scalar> struct scalar_ndtri_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+    using numext::ndtri; return ndtri(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ndtri_op<Scalar> >
+{
+  enum {
+    // On average, We are evaluating rational functions with degree N=9 in the
+    // numerator and denominator. This results in 2*N additions and 2*N
+    // multiplications.
+    Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNdtri
+  };
+};
+
 /** \internal
  * \brief Template functor to compute the exponentially scaled modified Bessel
  * function of order zero
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
index fbdfd299e..538db2afa 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -30,6 +30,9 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::ha
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
 }
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
+}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
   return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
 }
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 1464a9751..78050d0a1 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -38,63 +38,6 @@ namespace internal {
 
 namespace cephes {
 
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Scalar, int N>
-struct polevl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
-  }
-};
-
-template <typename Scalar>
-struct polevl<Scalar, 0> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
-    return coef[0];
-  }
-};
-
 /* chbevl (modified for Eigen)
  *
  *     Evaluate Chebyshev series
@@ -190,7 +133,7 @@ template <>
 struct lgamma_impl<float> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) 
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
     int dummy;
     return ::lgammaf_r(x, &dummy);
 #elif defined(SYCL_DEVICE_ONLY)
@@ -205,7 +148,7 @@ template <>
 struct lgamma_impl<double> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) 
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
     int dummy;
     return ::lgamma_r(x, &dummy);
 #elif defined(SYCL_DEVICE_ONLY)
@@ -264,7 +207,7 @@ struct digamma_impl_maybe_poly<float> {
     float z;
     if (s < 1.0e8f) {
       z = 1.0f / (s * s);
-      return z * cephes::polevl<float, 3>::run(z, A);
+      return z * internal::ppolevl<float, 3>::run(z, A);
     } else return 0.0f;
   }
 };
@@ -286,7 +229,7 @@ struct digamma_impl_maybe_poly<double> {
     double z;
     if (s < 1.0e17) {
       z = 1.0 / (s * s);
-      return z * cephes::polevl<double, 6>::run(z, A);
+      return z * internal::ppolevl<double, 6>::run(z, A);
     }
     else return 0.0;
   }
@@ -494,6 +437,246 @@ struct erfc_impl<double> {
 };
 #endif  // EIGEN_HAS_C99_MATH
 
+
+/***************************************************************************
+* Implementation of ndtri.                                                 *
+****************************************************************************/
+
+/* Inverse of Normal distribution function (modified for Eigen).
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, ndtri();
+ *
+ * x = ndtri( y );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns the argument, x, for which the area under the
+ * Gaussian probability density function (integrated from
+ * minus infinity to x) is equal to y.
+ *
+ *
+ * For small arguments 0 < y < exp(-2), the program computes
+ * z = sqrt( -2.0 * log(y) );  then the approximation is
+ * x = z - log(z)/z  - (1/z) P(1/z) / Q(1/z).
+ * There are two rational functions P/Q, one for 0 < y < exp(-32)
+ * and the other for y up to exp(-2).  For larger arguments,
+ * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
+ *
+ *
+ * ACCURACY:
+ *
+ *                      Relative error:
+ * arithmetic   domain        # trials      peak         rms
+ *    DEC      0.125, 1         5500       9.5e-17     2.1e-17
+ *    DEC      6e-39, 0.135     3500       5.7e-17     1.3e-17
+ *    IEEE     0.125, 1        20000       7.2e-16     1.3e-16
+ *    IEEE     3e-308, 0.135   50000       4.6e-16     9.8e-17
+ *
+ *
+ * ERROR MESSAGES:
+ *
+ *   message         condition    value returned
+ * ndtri domain       x <= 0        -MAXNUM
+ * ndtri domain       x >= 1         MAXNUM
+ *
+ */
+ /*
+   Cephes Math Library Release 2.2: June, 1992
+   Copyright 1985, 1987, 1992 by Stephen L. Moshier
+   Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+
+// TODO: Add a cheaper approximation for float.
+
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
+    const T& should_flipsign, const T& x) {
+  const T sign_mask = pset1<T>(-0.0);
+  T sign_bit = pand<T>(should_flipsign, sign_mask);
+  return pxor<T>(sign_bit, x);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(
+    const double& should_flipsign, const double& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(
+    const float& should_flipsign, const float& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+// We split this computation in to two so that in the scalar path
+// only one branch is evaluated (due to our template specialization of pselect
+// being an if statement.)
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
+  const ScalarType p0[] = {
+    ScalarType(-5.99633501014107895267e1),
+    ScalarType(9.80010754185999661536e1),
+    ScalarType(-5.66762857469070293439e1),
+    ScalarType(1.39312609387279679503e1),
+    ScalarType(-1.23916583867381258016e0)
+  };
+  const ScalarType q0[] = {
+    ScalarType(1.0),
+    ScalarType(1.95448858338141759834e0),
+    ScalarType(4.67627912898881538453e0),
+    ScalarType(8.63602421390890590575e1),
+    ScalarType(-2.25462687854119370527e2),
+    ScalarType(2.00260212380060660359e2),
+    ScalarType(-8.20372256168333339912e1),
+    ScalarType(1.59056225126211695515e1),
+    ScalarType(-1.18331621121330003142e0)
+  };
+  const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
+  const T half = pset1<T>(ScalarType(0.5));
+  T c, c2, ndtri_gt_exp_neg_two;
+
+  c = psub(b, half);
+  c2 = pmul(c, c);
+  ndtri_gt_exp_neg_two = pmadd(c, pmul(
+      c2, pdiv(
+          internal::ppolevl<T, 4>::run(c2, p0),
+          internal::ppolevl<T, 8>::run(c2, q0))), c);
+  return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(
+    const T& b, const T& should_flipsign) {
+  /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
+   * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
+   */
+  const ScalarType p1[] = {
+    ScalarType(4.05544892305962419923e0),
+    ScalarType(3.15251094599893866154e1),
+    ScalarType(5.71628192246421288162e1),
+    ScalarType(4.40805073893200834700e1),
+    ScalarType(1.46849561928858024014e1),
+    ScalarType(2.18663306850790267539e0),
+    ScalarType(-1.40256079171354495875e-1),
+    ScalarType(-3.50424626827848203418e-2),
+    ScalarType(-8.57456785154685413611e-4)
+  };
+  const ScalarType q1[] = {
+    ScalarType(1.0),
+    ScalarType(1.57799883256466749731e1),
+    ScalarType(4.53907635128879210584e1),
+    ScalarType(4.13172038254672030440e1),
+    ScalarType(1.50425385692907503408e1),
+    ScalarType(2.50464946208309415979e0),
+    ScalarType(-1.42182922854787788574e-1),
+    ScalarType(-3.80806407691578277194e-2),
+    ScalarType(-9.33259480895457427372e-4)
+  };
+  /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
+   * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+   */
+  const ScalarType p2[] = {
+    ScalarType(3.23774891776946035970e0),
+    ScalarType(6.91522889068984211695e0),
+    ScalarType(3.93881025292474443415e0),
+    ScalarType(1.33303460815807542389e0),
+    ScalarType(2.01485389549179081538e-1),
+    ScalarType(1.23716634817820021358e-2),
+    ScalarType(3.01581553508235416007e-4),
+    ScalarType(2.65806974686737550832e-6),
+    ScalarType(6.23974539184983293730e-9)
+  };
+  const ScalarType q2[] = {
+    ScalarType(1.0),
+    ScalarType(6.02427039364742014255e0),
+    ScalarType(3.67983563856160859403e0),
+    ScalarType(1.37702099489081330271e0),
+    ScalarType(2.16236993594496635890e-1),
+    ScalarType(1.34204006088543189037e-2),
+    ScalarType(3.28014464682127739104e-4),
+    ScalarType(2.89247864745380683936e-6),
+    ScalarType(6.79019408009981274425e-9)
+  };
+  const T eight = pset1<T>(ScalarType(8.0));
+  const T one = pset1<T>(ScalarType(1));
+  const T neg_two = pset1<T>(ScalarType(-2));
+  T x, x0, x1, z;
+
+  x = psqrt(pmul(neg_two, plog(b)));
+  x0 = psub(x, pdiv(plog(x), x));
+  z = one / x;
+  x1 = pmul(
+      z, pselect(
+          pcmp_lt(x, eight),
+          pdiv(internal::ppolevl<T, 8>::run(z, p1),
+               internal::ppolevl<T, 8>::run(z, q1)),
+          pdiv(internal::ppolevl<T, 8>::run(z, p2),
+               internal::ppolevl<T, 8>::run(z, q2))));
+  return flipsign(should_flipsign, psub(x0, x1));
+}
+
+template <typename T, typename ScalarType>
+T generic_ndtri(const T& a) {
+  const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
+  const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
+
+  const T zero = pset1<T>(ScalarType(0));
+  const T one = pset1<T>(ScalarType(1));
+  // exp(-2)
+  const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
+  T b, ndtri, should_flipsign;
+
+  should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
+  b = pselect(should_flipsign, a, psub(one, a));
+
+  ndtri = pselect(
+      pcmp_lt(exp_neg_two, b),
+      generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
+      generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
+
+  return pselect(
+      pcmp_le(a, zero), neg_maxnum,
+      pselect(pcmp_le(one, a), maxnum, ndtri));
+}
+
+template <typename Scalar>
+struct ndtri_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+# else
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
+    return generic_ndtri<Scalar, Scalar>(x);
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+
 /**************************************************************************************************************
  * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
  **************************************************************************************************************/
@@ -2120,6 +2303,12 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
   return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar)
+    ndtri(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
+}
+
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
     igamma(const Scalar& a, const Scalar& x) {
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
index 465f41d54..b6323c4db 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -38,6 +38,13 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 
+/** \internal \returns the ndtri(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pndtri(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  using internal::generic_ndtri; return generic_ndtri<Packet, ScalarType>(a);
+}
+
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
index 40abcee3a..c831edc17 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
@@ -101,6 +101,19 @@ double2 perfc<double2>(const double2& a)
   return make_double2(erfc(a.x), erfc(a.y));
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pndtri<float4>(const float4& a)
+{
+  using numext::ndtri;
+  return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pndtri<double2>(const double2& a)
+{
+  using numext::ndtri;
+  return make_double2(ndtri(a.x), ndtri(a.y));
+}
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pigamma<float4>(const float4& a, const float4& x)
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 94625e6a3..0a2fa8e61 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -1069,6 +1069,66 @@ void test_gpu_erfc(const Scalar stddev)
   gpuFree(d_out);
 }
 #endif
+template <typename Scalar>
+void test_gpu_ndtri()
+{
+  Tensor<Scalar, 1> in_x(8);
+  Tensor<Scalar, 1> out(8);
+  Tensor<Scalar, 1> expected_out(8);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(0.);
+  in_x(2) = Scalar(0.5);
+  in_x(3) = Scalar(0.2);
+  in_x(4) = Scalar(0.8);
+  in_x(5) = Scalar(0.9);
+  in_x(6) = Scalar(0.1);
+  in_x(7) = Scalar(0.99);
+  in_x(8) = Scalar(0.01);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
+  expected_out(2) = Scalar(0.0);
+  expected_out(3) = Scalar(-0.8416212335729142);
+  expected_out(4) = Scalar(0.8416212335729142);j
+  expected_out(5) = Scalar(1.2815515655446004);
+  expected_out(6) = Scalar(-1.2815515655446004);
+  expected_out(7) = Scalar(2.3263478740408408);
+  expected_out(8) = Scalar(-2.3263478740408408);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+  gpu_out.device(gpu_device) = gpu_in_x.ndtri();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY((std::isnan)(out(3)));
+
+  for (int i = 1; i < 6; ++i) {
+    if (i != 3) {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_out);
+}
 
 template <typename Scalar>
 void test_gpu_betainc()
@@ -1538,6 +1598,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 
 #if !defined(EIGEN_USE_HIP)
 // disable these tests on HIP for now.
+
+  CALL_SUBTEST_5(test_gpu_ndtri<float>());
+  CALL_SUBTEST_5(test_gpu_ndtri<double>());
+
   CALL_SUBTEST_5(test_gpu_digamma<float>());
   CALL_SUBTEST_5(test_gpu_digamma<double>());
 
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
index 50dae6c93..140a5e4c1 100644
--- a/unsupported/test/special_functions.cpp
+++ b/unsupported/test/special_functions.cpp
@@ -133,6 +133,26 @@ template<typename ArrayType> void array_special_functions()
   }
 #endif  // EIGEN_HAS_C99_MATH
 
+  // Check the ndtri function against scipy.special.ndtri
+  {
+    ArrayType x(7), res(7), ref(7);
+    x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01;
+    ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+    CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); );
+
+    // ndtri(normal_cdf(x)) ~= x
+    CALL_SUBTEST(
+        ArrayType m1 = ArrayType::Random(32);
+        using std::sqrt;
+
+        ArrayType cdf_val = (m1 / sqrt(2.)).erf();
+        cdf_val = (cdf_val + 1.) / 2.;
+        verify_component_wise(cdf_val.ndtri(), m1););
+
+  }
+
   // Check the zeta function against scipy.special.zeta
   {
     ArrayType x(7), q(7), res(7), ref(7);
-- 
cgit v1.2.3


From 5702a579261b7227089a7e642fa9be0cb0fe1ad5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 4 Sep 2019 22:57:04 +0200
Subject: Fix possible warning regarding strict equality comparisons

---
 Eigen/src/Core/GenericPacketMath.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 651e3f7b3..5ce984caf 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -276,12 +276,12 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) {
 
 template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
     const float& mask, const float& a, const float&b) {
-  return mask == 0 ? b : a;
+  return numext::equal_strict(mask,0.f) ? b : a;
 }
 
 template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
     const double& mask, const double& a, const double& b) {
-  return mask == 0 ? b : a;
+  return numext::equal_strict(mask,0.) ? b : a;
 }
 
 /** \internal \returns a <= b as a bit mask */
-- 
cgit v1.2.3


From e6c183f8fd0c9c093eb30e08bd08e8e48a80264c Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 4 Sep 2019 23:00:21 +0200
Subject: Fix doc issues regarding ndtri

---
 Eigen/src/plugins/ArrayCwiseUnaryOps.h | 12 +++++++-----
 doc/CoeffwiseMathFunctionsTable.dox    | 12 ++++++++++++
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 4aef72d92..06ac7aad0 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -608,16 +608,18 @@ erfc() const
   return ErfcReturnType(derived());
 }
 
-/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function
   * function of *this.
   *
   * \specialfunctions_module
+  * 
+  * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the
+  * Gaussian probability density function (integrated from minus infinity to x) is equal to y.
   *
-  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations of ndtri(T) for any scalar
-  * type T to be supported.
+  * \note This function supports only float and double scalar types. To support other scalar types,
+  * the user has to provide implementations of ndtri(T) for any scalar type T to be supported.
   *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>, erf()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>
   */
 EIGEN_DEVICE_FUNC
 inline const NdtriReturnType
diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox
index 080e056e1..8186a5272 100644
--- a/doc/CoeffwiseMathFunctionsTable.dox
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@@ -553,6 +553,18 @@ This also means that, unless specified, if the function \c std::foo is available
   </td>
   <td></td>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_ndtri
+  a.\link ArrayBase::ndtri ndtri\endlink(); \n
+  \link Eigen::ndtri ndtri\endlink(a);
+  </td>
+  <td>Inverse of the CDF of the Normal distribution function</td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
 <tr><td colspan="4"></td></tr>
 </table>
 
-- 
cgit v1.2.3