Disentangle the GPU code from the CPU code. This means a few things:

* The "core_cpu_internal" build target no longer includes files from the common_runtime/gpu/ directory. * tensorflow/core internal targets instead can get access to those headers via the "gpu_runtime" target. * The class "CopyTensor" is introduced. It lives in common_runtime/ but supports registration of copy functions so the "gpu_runtime" target can add a GPU->GPU copy ability if it is linked in. This registration should make it easier to add more device types in the future. * The "core_cpu" and "core_cpu_internal" build targets no longer reference GPUUtil::CopyViaDMA; rendezvous_mgr uses CopyTensor instead. Also the "copy_tensor" build target was not needed. Change: 112821119
author: Josh Levenberg <josh11b@tensorflow.org> 2016-01-22 14:45:34 -0800
committer: Vijay Vasudevan <vrv@google.com> 2016-01-22 17:04:38 -0800
commit: 4ba51b33357d68f882a920fb4f87bfe67bb034a0 (patch)
tree: 1d9741962b25bde407e90103c506ca9ab1ec2042 /tensorflow
parent: fde1dc4a489471bb9064f7a0013b9c89f46febf8 (diff)
7 files changed, 228 insertions, 153 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index f0e962a735..95d5531876 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -773,23 +773,13 @@ cc_library(
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
-cc_library(
-    name = "copy_tensor",
-    deps = [
-        ":lib",
-        ":protos_cc",
-        ":stream_executor",
-        "//third_party/eigen3",
-    ],
-)
-
 tf_cuda_library(
     name = "core_cpu_internal",
     srcs = glob(
         [
             "client/**/*.cc",
-            "common_runtime/**/*.h",  # TODO(josh11b): exclude common_runtime/gpu/
-            "common_runtime/**/*.cc",
+            "common_runtime/*.h",
+            "common_runtime/*.cc",
             "graph/**/*.h",
             "graph/**/*.cc",
             "public/session.h",
@@ -800,8 +790,6 @@ tf_cuda_library(
         exclude = [
             "**/*test*",
             "**/*main.cc",
-            "common_runtime/gpu/*.cc",
-            "common_runtime/copy_tensor.cc",
             "common_runtime/gpu_device_factory.cc",
             "common_runtime/direct_session.cc",
             "common_runtime/direct_session.h",
@@ -809,7 +797,7 @@ tf_cuda_library(
     ),
     hdrs = glob(
         [
-            "common_runtime/**/*.h",  # TODO(josh11b): exclude common_runtime/gpu/
+            "common_runtime/*.h",
             "graph/**/*.h",
         ],
         exclude = [
@@ -819,7 +807,6 @@ tf_cuda_library(
     ),
     copts = tf_copts(),
     deps = [
-        ":copy_tensor",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -861,7 +848,6 @@ tf_cuda_library(
     name = "gpu_runtime",
     srcs = glob(
         [
-            "common_runtime/gpu/*.h",
             "common_runtime/gpu/*.cc",
         ],
         exclude = [
@@ -869,6 +855,7 @@ tf_cuda_library(
             "**/*test.cc",
         ],
     ),
+    hdrs = glob(["common_runtime/gpu/*.h"]),
     copts = tf_copts(),
     cuda_deps = [
         ":cuda",
@@ -1020,6 +1007,7 @@ tf_cc_tests(
         ":direct_session",
         ":framework",
         ":framework_internal",
+        ":gpu_runtime",
         ":kernels",
         ":lib",
         ":lib_internal",
diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc
new file mode 100644
index 0000000000..54fe6a4a08
--- /dev/null
+++ b/tensorflow/core/common_runtime/copy_tensor.cc
@@ -0,0 +1,110 @@
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+
+#include <vector>
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tracing.h"
+
+namespace tensorflow {
+namespace {
+
+static bool initialization_done = false;
+
+struct RegistrationInfo {
+  RegistrationInfo(DeviceType s, DeviceType r, CopyTensor::CopyFunction cf)
+      : sender_device_type(s), receiver_device_type(r), copy_function(cf) {}
+  DeviceType sender_device_type;
+  DeviceType receiver_device_type;
+  CopyTensor::CopyFunction copy_function;
+};
+
+// We use a vector instead of a map since we expect there to be very
+// few registrations.
+std::vector<RegistrationInfo>* MutableRegistry() {
+  static std::vector<RegistrationInfo>* registry =
+      new std::vector<RegistrationInfo>;
+  return registry;
+}
+
+}  // namespace
+
+// static
+void CopyTensor::ViaDMA(const string& edge_name,
+                        DeviceContext* send_dev_context,
+                        DeviceContext* recv_dev_context, Device* src,
+                        Device* dst, const AllocatorAttributes src_alloc_attr,
+                        const AllocatorAttributes dst_alloc_attr,
+                        const Tensor* input, Tensor* output,
+                        StatusCallback done) {
+  initialization_done = true;
+  port::Tracing::ScopedAnnotation annotation(edge_name);
+  VLOG(1) << "CopyViaDMA " << edge_name;
+  const size_t total_bytes = input->TotalBytes();
+
+  // Note that 0-size tensors have no backing buffer.
+  if (total_bytes > 0) {
+    const DeviceType src_device_type(src_alloc_attr.on_host()
+                                         ? DEVICE_CPU
+                                         : src->attributes().device_type());
+    const DeviceType dst_device_type(dst_alloc_attr.on_host()
+                                         ? DEVICE_CPU
+                                         : dst->attributes().device_type());
+    const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
+    const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+
+    if (non_cpu_src) {
+      if (non_cpu_dst) {
+        // Device to device copy.  Look through registry for an appropriate
+        // CopyFunction.
+        std::vector<RegistrationInfo>* registry = MutableRegistry();
+        for (const RegistrationInfo& ri : *registry) {
+          if (ri.sender_device_type == src_device_type &&
+              ri.receiver_device_type == dst_device_type) {
+            ri.copy_function(send_dev_context, recv_dev_context, src, dst,
+                             src_alloc_attr, dst_alloc_attr, input, output,
+                             done);
+            return;
+          }
+        }
+
+        // TODO(josh11b): If no CopyFunction is found, we currently fail
+        // but we could copy between devices via CPU.
+        done(errors::Unimplemented(
+            "No function registered to copy from devices of type ",
+            src_device_type.type(), " to devices of type ",
+            dst_device_type.type()));
+      } else {
+        // Device to host copy.
+        return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src,
+                                                       output, done);
+      }
+    } else if (non_cpu_dst) {
+      // Host to Device copy.
+      // Note that this is already an async copy.
+      recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done);
+    } else {
+      *output = *input;
+      done(Status::OK());
+    }
+  } else {
+    // buffer is empty
+    done(Status::OK());
+  }
+}
+
+// static
+Status CopyTensor::Register(DeviceType sender_device_type,
+                            DeviceType receiver_device_type,
+                            CopyFunction copy_function) {
+  if (initialization_done) {
+    return errors::FailedPrecondition(
+        "May only register CopyTensor functions during before the first tensor "
+        "is copied.");
+  }
+  std::vector<RegistrationInfo>* registry = MutableRegistry();
+  registry->emplace_back(sender_device_type, receiver_device_type,
+                         copy_function);
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/copy_tensor.h b/tensorflow/core/common_runtime/copy_tensor.h
new file mode 100644
index 0000000000..5bfeadbca1
--- /dev/null
+++ b/tensorflow/core/common_runtime/copy_tensor.h
@@ -0,0 +1,54 @@
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class CopyTensor {
+ public:
+  typedef void (*CopyFunction)(DeviceContext* send_dev_context,
+                               DeviceContext* recv_dev_context, Device* src,
+                               Device* dst,
+                               const AllocatorAttributes src_alloc_attr,
+                               const AllocatorAttributes dst_alloc_attr,
+                               const Tensor* input, Tensor* output,
+                               StatusCallback done);
+
+  // Copies "input" to "output" between devices accessible to the
+  // local process via some DMA-like method.  "edge_name" is the name
+  // of the tensor being copied, for debugging purposes. Depending on
+  // the type of devices and memory in use, the copy may be performed
+  // synchronously or asynchronously.  'done' will be invoked only
+  // after the copy is actually complete.
+  static void ViaDMA(const string& edge_name, DeviceContext* send_dev_context,
+                     DeviceContext* recv_dev_context, Device* src, Device* dst,
+                     const AllocatorAttributes src_alloc_attr,
+                     const AllocatorAttributes dst_alloc_attr,
+                     const Tensor* input, Tensor* output, StatusCallback done);
+
+  // Register a function for copying between two specific DeviceTypes.
+  static Status Register(DeviceType sender_device_type,
+                         DeviceType receiver_device_type,
+                         CopyFunction copy_function);
+
+  // Object used to call Register() at static-initialization time.
+  class Registration {
+   public:
+    Registration(DeviceType sender_device_type, DeviceType receiver_device_type,
+                 CopyFunction copy_function) {
+      TF_QCHECK_OK(
+          Register(sender_device_type, receiver_device_type, copy_function));
+    }
+  };
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/dma_helper.h b/tensorflow/core/common_runtime/dma_helper.h
index 57303bbf81..d0a6414189 100644
--- a/tensorflow/core/common_runtime/gpu/dma_helper.h
+++ b/tensorflow/core/common_runtime/dma_helper.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+#ifndef TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
+#define TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
 
 #include "tensorflow/core/public/tensor.h"
 
-// For internal use only.  Visibility should be limited to brain/framework.
-
 namespace tensorflow {
+
+// For TensorFlow internal use only.
 class DMAHelper {
  public:
   static bool CanUseDMA(const Tensor* t) { return t->CanUseDMA(); }
@@ -29,5 +29,7 @@ class DMAHelper {
   static TensorBuffer* buffer(Tensor* t) { return t->buf_; }
   static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; }
 };
+
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+
+#endif  // TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 9d2f917f51..b3d5024b5c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -15,8 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 
+#include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/gpu/dma_helper.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
@@ -105,92 +106,49 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
   }
 }
 
-typedef ProcessState::MemDesc PMD;
-
-/*static*/
-void GPUUtil::CopyViaDMA(const string& edge_name,
-                         DeviceContext* send_dev_context,
-                         DeviceContext* recv_dev_context, Device* src,
-                         Device* dst, AllocatorAttributes src_alloc_attr,
-                         AllocatorAttributes dst_alloc_attr,
-                         const Tensor* input, Tensor* output,
-                         StatusCallback done) {
-  port::Tracing::ScopedAnnotation annotation(edge_name);
-  VLOG(1) << "CopyViaDMA " << edge_name;
-  size_t total_bytes = input->TotalBytes();
-  // Note that 0-size tensors have no backing buffer.
-  if (total_bytes > 0) {
-    const void* src_ptr = DMAHelper::base(input);
-    void* dst_ptr = DMAHelper::base(output);
-    VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
-    if (FLAGS_brain_gpu_record_mem_types) {
-      ProcessState::MemDesc smd = ProcessState::singleton()->PtrType(src_ptr);
-      ProcessState::MemDesc dmd = ProcessState::singleton()->PtrType(dst_ptr);
-      VLOG(0) << "Src " << smd.DebugString() << " Dst " << dmd.DebugString();
-      if (smd.loc == PMD::CPU && dmd.loc == PMD::GPU && (!smd.gpu_registered)) {
-        LOG(WARNING) << "CPU -> GPU no reg for " << edge_name;
-      }
-      if (dmd.loc == PMD::CPU && smd.loc == PMD::GPU && (!dmd.gpu_registered)) {
-        LOG(WARNING) << "GPU -> CPU no reg for " << edge_name;
-      }
-    }
-
-    auto src_device_type = src->attributes().device_type();
-    auto dst_device_type = dst->attributes().device_type();
+// static
+void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
+                                 DeviceContext* recv_dev_context, Device* src,
+                                 Device* dst,
+                                 AllocatorAttributes src_alloc_attr,
+                                 AllocatorAttributes dst_alloc_attr,
+                                 const Tensor* input, Tensor* output,
+                                 StatusCallback done) {
+  const void* src_ptr = DMAHelper::base(input);
+  void* dst_ptr = DMAHelper::base(output);
+  VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
+  const size_t total_bytes = input->TotalBytes();
 
-    bool non_cpu_src = (!src_alloc_attr.on_host() &&
-                        src_device_type != DeviceType(DEVICE_CPU).type());
-    bool non_cpu_dst = (!dst_alloc_attr.on_host() &&
-                        dst_device_type != DeviceType(DEVICE_CPU).type());
-    if (non_cpu_src) {
-      if (non_cpu_dst) {
-        // Device to device copy
-        gpu::Stream* stream = send_dev_context->stream();
-        if (stream == nullptr) {
-          done(errors::Internal("Failed to find device stream"));
-          return;
-        }
-        auto* src_dev_info = src->tensorflow_gpu_device_info();
-        CHECK(src_dev_info);
+  gpu::Stream* stream = send_dev_context->stream();
+  if (stream == nullptr) {
+    done(errors::Internal("Failed to find device stream"));
+    return;
+  }
+  auto* src_dev_info = src->tensorflow_gpu_device_info();
+  CHECK(src_dev_info);
 
-        DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
-        stream->ThenMemcpy(
-            &gpu_dst_ptr,
-            DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
-            total_bytes);
-        if (dst_device_type == DeviceType(DEVICE_GPU).type()) {
-          // Use of input may outlive stack scope, so keep a ref.
-          TensorReference input_ref(*input);
-          src_dev_info->event_mgr->ThenExecute(
-              stream, [done, stream, input_ref]() {
-                input_ref.Unref();
-                if (!stream->ok()) {
-                  done(errors::Internal("GPU->GPU Memcpy failed"));
-                } else {
-                  done(Status::OK());
-                }
-              });
-        }
-        send_dev_context->MaintainLifetimeOnStream(input, stream);
+  DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
+  stream->ThenMemcpy(&gpu_dst_ptr,
+                     DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
+                     total_bytes);
+  if (dst->attributes().device_type() == DeviceType(DEVICE_GPU).type()) {
+    // Use of input may outlive stack scope, so keep a ref.
+    TensorReference input_ref(*input);
+    src_dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() {
+      input_ref.Unref();
+      if (!stream->ok()) {
+        done(errors::Internal("GPU->GPU Memcpy failed"));
       } else {
-        // Device to host copy.
-        return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src,
-                                                       output, done);
+        done(Status::OK());
       }
-    } else if (non_cpu_dst) {
-      // Host to Device copy.
-      // Note that this is already an async copy.
-      recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done);
-    } else {
-      memcpy(dst_ptr, src_ptr, total_bytes);
-      done(Status::OK());
-    }
-  } else {
-    // buffer is empty
-    done(Status::OK());
+    });
   }
+  send_dev_context->MaintainLifetimeOnStream(input, stream);
 }
 
+static CopyTensor::Registration register_gpu_gpu_copy(
+    DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy);
+
 void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
                                  const DeviceContext* device_context,
                                  const Tensor* gpu_tensor, Tensor* cpu_tensor,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index b9044fcd08..ab9d87186a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
 
 #include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/gpu/dma_helper.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/public/status.h"
 #include "tensorflow/core/public/tensor.h"
@@ -42,20 +42,6 @@ class GPUUtil {
                               TensorProto* proto, bool is_dead,
                               StatusCallback done);
 
-  // Copies "input" to "output" between devices accessible to the
-  // local process via some DMA-like method.  "edge_name" is the name
-  // of the tensor being copied, for debugging purposes. Depending on
-  // the type of devices and memory in use, the copy may be performed
-  // synchronously or asynchronously.  'done' will be invoked only
-  // after the copy is actually complete.
-  static void CopyViaDMA(const string& edge_name,
-                         DeviceContext* send_dev_context,
-                         DeviceContext* recv_dev_context, Device* src,
-                         Device* dst, const AllocatorAttributes src_alloc_attr,
-                         const AllocatorAttributes dst_alloc_attr,
-                         const Tensor* input, Tensor* output,
-                         StatusCallback done);
-
   // Copies the data in 'gpu_tensor' into 'cpu_tensor'.
   // 'gpu_tensor''s backing memory must be on 'gpu_device' and
   // 'cpu_tensor' must be allocated to be of the same size as
@@ -96,6 +82,14 @@ class GPUUtil {
                                  const DeviceContext* device_context,
                                  Device* gpu_device, Tensor* gpu_tensor,
                                  StatusCallback done);
+
+  static void DeviceToDeviceCopy(DeviceContext* send_dev_context,
+                                 DeviceContext* recv_dev_context, Device* src,
+                                 Device* dst,
+                                 AllocatorAttributes src_alloc_attr,
+                                 AllocatorAttributes dst_alloc_attr,
+                                 const Tensor* input, Tensor* output,
+                                 StatusCallback done);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 95c11c075d..2f311f9db8 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -17,11 +17,9 @@ limitations under the License.
 
 #include <unordered_set>
 
+#include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
-#if !defined(__ANDROID__) && (defined(PLATFORM_GOOGLE) || GOOGLE_CUDA)
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#endif
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
@@ -33,35 +31,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-
-void CopyTensorBetweenDevices(const string& id, DeviceContext* send_dev_context,
-                              DeviceContext* recv_dev_context, Device* src,
-                              Device* dst,
-                              const AllocatorAttributes src_alloc_attr,
-                              const AllocatorAttributes dst_alloc_attr,
-                              const Tensor* input, Tensor* output,
-                              std::function<void(const Status&)> done) {
-  if (src->attributes().device_type() != dst->attributes().device_type()) {
-    done(errors::Unimplemented(
-        "Copy between device types not yet implemented: src=", src->name(),
-        " dst=", dst->name()));
-  } else if (src->attributes().device_type() != "CPU") {
-    done(errors::Unimplemented(
-        "Copy between non-CPU devices not yet implemented"));
-  }
-  *output = *input;
-  done(Status::OK());
-}
-
-#if !defined(__ANDROID__) && (defined(PLATFORM_GOOGLE) || GOOGLE_CUDA)
-constexpr auto CopyTensorBetweenDevicesFunc = &GPUUtil::CopyViaDMA;
-#else
-constexpr auto CopyTensorBetweenDevicesFunc = &CopyTensorBetweenDevices;
-#endif
-
-}  // end namespace
-
 IntraProcessRendezvous::IntraProcessRendezvous(const DeviceMgr* device_mgr)
     : device_mgr_(device_mgr), local_(NewLocalRendezvous()) {}
 
@@ -136,10 +105,10 @@ void IntraProcessRendezvous::SameWorkerRecvDone(
   Tensor copy(out_allocator, in.dtype(), in.shape());
   *out = copy;
 
-  CopyTensorBetweenDevicesFunc(parsed.edge_name, send_args.device_context,
-                               recv_args.device_context, src_device, dst_device,
-                               send_args.alloc_attrs, recv_args.alloc_attrs,
-                               &in, out, done);
+  CopyTensor::ViaDMA(parsed.edge_name, send_args.device_context,
+                     recv_args.device_context, src_device, dst_device,
+                     send_args.alloc_attrs, recv_args.alloc_attrs, &in, out,
+                     done);
 }
 
 void IntraProcessRendezvous::RecvAsync(const string& key,
author	Josh Levenberg <josh11b@tensorflow.org>	2016-01-22 14:45:34 -0800
committer	Vijay Vasudevan <vrv@google.com>	2016-01-22 17:04:38 -0800
commit	4ba51b33357d68f882a920fb4f87bfe67bb034a0 (patch)
tree	1d9741962b25bde407e90103c506ca9ab1ec2042 /tensorflow
parent	fde1dc4a489471bb9064f7a0013b9c89f46febf8 (diff)