author    A. Unique TensorFlower <gardener@tensorflow.org>  2017-09-29 18:05:54 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>   2017-09-29 18:10:04 -0700
commit    ac742fab0bf4c8b7bde5febc33e09fedfcb57aa1 (patch)
tree      bd46bb1683f288b531cc422c31f27bf064e5c8cc /tensorflow/core/kernels/matrix_inverse_op.cc
parent    2f7eef77426e4cd7b5d577b10968b6786acb5bbd (diff)
* Add a mechanism to CudaSolver for capturing references to temporary tensors. This way users of the class don't have to remember to capture each one manually to avoid premature deallocation and memory races for asynchronous op kernels.
* Add simple tests that run multiple ops concurrently for linalg ops that use CudaSolver.
* Put a lock around the calls to cusolverDn*getrs and cusolverDn*gesvd, which appear not to be thread-safe.
* Misc. cleanup in linalg GPU kernels.

I ran all the related tests 1000 times without failure. Before this change, tests for matrix_solve and svd would fail or hang occasionally.

PiperOrigin-RevId: 170557380
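The sketch below illustrates the intended usage pattern. It is a minimal illustration assembled from the calls that appear in the diff below (allocate_scoped_tensor, GetScratchSpace, GetDeviceLapackInfo, CheckLapackInfoAndDeleteSolverAsync); the free-standing function, the batch_size placeholder, and the "ptr_array" scratch name are illustrative only and not part of this change.

    // Minimal sketch of the new CudaSolver ownership pattern; includes,
    // kernel registration, and the actual LAPACK launches are omitted.
    template <class Scalar>
    void ComputeAsyncSketch(OpKernelContext* context,
                            AsyncOpKernel::DoneCallback done) {
      const Tensor& input = context->input(0);
      const int64 batch_size = 1;  // placeholder for the real batch count

      // The solver instance owns every temporary allocated through it.
      std::unique_ptr<CudaSolver> solver(new CudaSolver(context));

      // Allocate temporaries via the solver instead of
      // context->allocate_temp(); they stay alive until the asynchronous
      // GPU work has completed.
      Tensor input_copy;
      OP_REQUIRES_OK_ASYNC(
          context,
          solver->allocate_scoped_tensor(DataTypeToEnum<Scalar>::value,
                                         input.shape(), &input_copy),
          done);
      auto ptr_array = solver->GetScratchSpace<uint8>(
          sizeof(Scalar*) * batch_size, "ptr_array", /* on_host */ true);

      std::vector<DeviceLapackInfo> dev_info;
      dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
      // ... launch Getrf/Getrs/MatInvBatched/... through solver-> here ...

      auto info_checker = [context, done](
                              const Status& status,
                              const std::vector<HostLapackInfo>& host_infos) {
        OP_REQUIRES_OK_ASYNC(context, status, done);
        done();
      };
      // Hands the solver, and with it all scoped tensors and scratch space,
      // over to the asynchronous info check, so nothing is deallocated before
      // the GPU kernels finish and the lambda no longer needs to capture each
      // temporary explicitly.
      CudaSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver),
                                                      dev_info,
                                                      std::move(info_checker));
    }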
Diffstat (limited to 'tensorflow/core/kernels/matrix_inverse_op.cc')
-rw-r--r--  tensorflow/core/kernels/matrix_inverse_op.cc | 119
1 file changed, 59 insertions, 60 deletions
diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc
index 715bad8b07..a152b5cbee 100644
--- a/tensorflow/core/kernels/matrix_inverse_op.cc
+++ b/tensorflow/core/kernels/matrix_inverse_op.cc
@@ -122,13 +122,17 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
return;
}
+ // TODO(rmlarsen): Convert to std::make_unique when available.
+ std::unique_ptr<CudaSolver> solver(new CudaSolver(context));
+
// Make a copy of the (possible adjointed) input that we will use for the
// factorization step.
Tensor input_copy;
- OP_REQUIRES_OK_ASYNC(context,
- context->allocate_temp(DataTypeToEnum<Scalar>::value,
- input.shape(), &input_copy),
- done);
+ OP_REQUIRES_OK_ASYNC(
+ context,
+ solver->allocate_scoped_tensor(DataTypeToEnum<Scalar>::value,
+ input.shape(), &input_copy),
+ done);
auto input_copy_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
auto input_reshaped = input.template flat_inner_dims<Scalar, 3>();
const GPUDevice& device = context->eigen_device<GPUDevice>();
@@ -142,14 +146,21 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
}
const int64 batch_size = input_copy_reshaped.dimension(0);
- CudaSolver solver(context);
+ Tensor pivots;
+ OP_REQUIRES_OK_ASYNC(
+ context,
+ solver->allocate_scoped_tensor(DataTypeToEnum<int>::value,
+ TensorShape{batch_size, n}, &pivots),
+ done);
+ auto pivots_mat = pivots.template matrix<int>();
+ auto input_copy_ptr_array = solver->GetScratchSpace<uint8>(
+ sizeof(Scalar*) * batch_size, "input_copy_ptr_array",
+ /* on_host */ true);
+ auto output_ptr_array = solver->GetScratchSpace<uint8>(
+ sizeof(Scalar*) * batch_size, "output_copy_ptr_array",
+ /* on_host */ true);
+ auto output_reshaped = output->template flat_inner_dims<Scalar, 3>();
std::vector<DeviceLapackInfo> dev_info;
- ScratchSpace<int> pivots(context, n * batch_size, /* on_host */ false);
- ScratchSpace<uint8> input_copy_ptr_array(context,
- sizeof(Scalar*) * batch_size,
- /* on_host */ true);
- ScratchSpace<uint8> output_ptr_array(context, sizeof(Scalar*) * batch_size,
- /* on_host */ true);
if (n < 32 || batch_size > n) {
// For small matrices or very large batch sizes, we use the batched
// interfaces in cuBlas to avoid being dominated by kernel launch
@@ -160,37 +171,40 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
reinterpret_cast<const Scalar**>(input_copy_ptr_array.mutable_data());
const Scalar** output_ptr_array_base =
reinterpret_cast<const Scalar**>(output_ptr_array.mutable_data());
- auto output_reshaped = output->template flat_inner_dims<Scalar, 3>();
- for (int64 i = 0; i < batch_size; ++i) {
- input_copy_ptr_array_base[i] = input_copy_reshaped.data() + i * n * n;
- output_ptr_array_base[i] = output_reshaped.data() + i * n * n;
+ for (int batch = 0; batch < batch_size; ++batch) {
+ input_copy_ptr_array_base[batch] = &input_copy_reshaped(batch, 0, 0);
+ output_ptr_array_base[batch] = &output_reshaped(batch, 0, 0);
}
if (n < 32) {
// MatInvBatched only supports n < 32.
- dev_info.emplace_back(context, batch_size, "MatInvBatched");
- OP_REQUIRES_OK_ASYNC(context,
- solver.MatInvBatched(n, input_copy_ptr_array_base,
- n, output_ptr_array_base, n,
- &dev_info.back(), batch_size),
+ dev_info.push_back(
+ solver->GetDeviceLapackInfo(batch_size, "MatInvBatched"));
+ OP_REQUIRES_OK_ASYNC(
+ context,
+ solver->MatInvBatched(n, input_copy_ptr_array_base, n,
+ output_ptr_array_base, n, &dev_info.back(),
+ batch_size),
- done);
+          done);
} else {
// For larger matrices and large batch size, we used the batched
// GETRF/GETRI kernels.
- dev_info.emplace_back(context, batch_size, "GetrfBatched");
+ dev_info.push_back(
+ solver->GetDeviceLapackInfo(batch_size, "GetrfBatched"));
OP_REQUIRES_OK_ASYNC(context,
- solver.GetrfBatched(n, input_copy_ptr_array_base,
- n, pivots.mutable_data(),
- &dev_info.back(), batch_size),
+ solver->GetrfBatched(n, input_copy_ptr_array_base,
+ n, pivots_mat.data(),
+ &dev_info.back(), batch_size),
done);
// 2. Compute the inverse(s).
- dev_info.emplace_back(context, batch_size, "GetriBatched");
+ dev_info.push_back(
+ solver->GetDeviceLapackInfo(batch_size, "GetriBatched"));
OP_REQUIRES_OK_ASYNC(
context,
- solver.GetriBatched(n, input_copy_ptr_array_base, n, pivots.data(),
- output_ptr_array_base, n, &dev_info.back(),
- batch_size),
+ solver->GetriBatched(n, input_copy_ptr_array_base, n,
+ pivots_mat.data(), output_ptr_array_base, n,
+ &dev_info.back(), batch_size),
done);
}
} else {
@@ -198,50 +212,38 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
// sequentially. Here we use the cuSolver methods GETRF/GETRS because they
// are MUCH faster than their batched cuBlas equivalents for large
// matrices.
- dev_info.emplace_back(context, batch_size, "getrf");
- int* dev_info_ptr = dev_info.back().mutable_data();
- Scalar* input_copy_ptr = input_copy.flat<Scalar>().data();
- int* pivots_ptr = pivots.mutable_data();
+ dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
for (int batch = 0; batch < batch_size; ++batch) {
OP_REQUIRES_OK_ASYNC(
context,
- solver.Getrf(n, n, input_copy_ptr, n, pivots_ptr, dev_info_ptr),
+ solver->Getrf(n, n, &input_copy_reshaped(batch, 0, 0), n,
+ &pivots_mat(batch, 0), &dev_info.back()(batch)),
done);
- input_copy_ptr += n * n;
- pivots_ptr += n;
- ++dev_info_ptr;
}
// Set all right-hand sides to the identity.
functor::EyeFunctor<GPUDevice, Scalar> eye;
- eye(device, output->template flat_inner_dims<Scalar, 3>());
+ eye(device, output_reshaped);
// Solve A X = I.
- Scalar* output_ptr = output->template flat<Scalar>().data();
- input_copy_ptr = input_copy.flat<Scalar>().data();
- pivots_ptr = pivots.mutable_data();
- dev_info.emplace_back(context, batch_size, "getrs");
- dev_info_ptr = dev_info.back().mutable_data();
+ dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrs"));
for (int batch = 0; batch < batch_size; ++batch) {
OP_REQUIRES_OK_ASYNC(
context,
- solver.Getrs(CUBLAS_OP_N, n, n, input_copy_ptr, n, pivots_ptr,
- output_ptr, n, dev_info_ptr),
+ solver->Getrs(CUBLAS_OP_N, n, n, &input_copy_reshaped(batch, 0, 0),
+ n, &pivots_mat(batch, 0),
+ &output_reshaped(batch, 0, 0), n,
+ &dev_info.back()(batch)),
done);
- output_ptr += n * n;
- input_copy_ptr += n * n;
- pivots_ptr += n;
- ++dev_info_ptr;
}
}
- // Register callback to check info after kernels finish. Also capture the
+ // Callback for checking info after kernels finish. Also capture the
// temporary Tensors/ScratchSpace so they don't get deallocated before the
// kernels run. TODO(rmlarsen): Use move capture once C++14 becomes
// available.
- auto info_checker = [context, dev_info, input_copy, pivots,
- input_copy_ptr_array, output_ptr_array,
- done](const Status& status,
- const std::vector<HostLapackInfo>& host_infos) {
+ auto info_checker = [context, done](
+ const Status& status,
+ const std::vector<HostLapackInfo>& host_infos) {
if (!status.ok() && errors::IsInvalidArgument(status)) {
for (const auto& host_info : host_infos) {
for (int i = 0; i < host_info.size(); ++i) {
@@ -249,7 +251,7 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
// just print the original error message from the call itself
// below.
OP_REQUIRES_ASYNC(
- context, host_info[i] <= 0,
+ context, host_info(i) <= 0,
errors::InvalidArgument("Input is not invertible."), done);
}
}
@@ -257,11 +259,8 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
OP_REQUIRES_OK_ASYNC(context, status, done);
done();
};
-
- OP_REQUIRES_OK_ASYNC(
- context,
- solver.CopyLapackInfoToHostAsync(dev_info, std::move(info_checker)),
- done);
+ CudaSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver), dev_info,
+ std::move(info_checker));
}
private: