diff options
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_device.h')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_device.h | 36 |
1 file changed, 23 insertions, 13 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 56d03d7a8c..674e8384d5 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -65,6 +65,11 @@ class BaseGPUDevice : public LocalDevice { // completes. bool RequiresRecordingAccessedTensors() const override; + // GPU kernel execution requires us to use `tracing::ScopedAnnotation()` + // rather than `tracing::ScopedActivity()`, in order to relate asynchronously + // launched GPU kernels to the OpKernel. + bool TraceUsingAnnotations() const { return true; } + void ConsumeListOfAccessedTensors( DeviceContext* device_context, const TensorReferenceVector& tensor_refs) override; @@ -86,15 +91,16 @@ class BaseGPUDevice : public LocalDevice { // The caller owns the returned device. PerOpGpuDevice* MakeGpuDevice() override; - void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, - DeviceContext* dc, Allocator* allocator) override; + Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, + DeviceContext* dc, + Allocator* allocator) override; - // Returns the CUDA GPU id of this device within the native driver system; + // Returns the platform GPU id of this device within the native driver system; // e.g., for CUDA this is the ordinal of the GPU within the system. 
int gpu_id() const { - CudaGpuId cuda_gpu_id; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id_, &cuda_gpu_id)); - return cuda_gpu_id.value(); + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); + return platform_gpu_id.value(); } // The executor that provides control for the device; e.g., for CUDA this @@ -125,6 +131,7 @@ class BaseGPUDevice : public LocalDevice { class StreamGroupFactory; gtl::InlinedVector<StreamGroup*, 4> streams_; + mutex scratch_init_mutex_; gtl::InlinedVector<char*, 4> scratch_; std::vector<GPUDeviceContext*> device_contexts_; GpuDeviceInfo* gpu_device_info_ = nullptr; @@ -135,6 +142,9 @@ class BaseGPUDevice : public LocalDevice { std::unique_ptr<EventMgr> em_; std::unique_ptr<thread::ThreadPool> thread_pool_; + // Initialize scratch buffers used by Eigen. + Status InitScratchBuffers(); + void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device, int stream_id, Allocator* allocator); @@ -168,14 +178,14 @@ class BaseGPUDeviceFactory : public DeviceFactory { int32 strength; static const int kSameDeviceStrength; static const int kStreamExecutorStrength; - std::set<std::pair<CudaGpuId, CudaGpuId>> directed_links; + std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links; }; protected: // Populates *maps with interconnect maps for all local direct access // pathways between GPUs. 
virtual Status GetInterconnectMaps( - const std::vector<CudaGpuId>& visible_gpu_order, + const std::vector<PlatformGpuId>& visible_gpu_order, se::Platform* gpu_manager, std::vector<InterconnectMap>* maps); struct TfGpuIdHash { @@ -207,16 +217,16 @@ class BaseGPUDeviceFactory : public DeviceFactory { Allocator* gpu_allocator, Allocator* cpu_allocator) = 0; - // Returns into 'ids' the list of valid CUDA GPU ids, in the order that + // Returns into 'ids' the list of valid platform GPU ids, in the order that // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc, // based upon 'visible_gpu_order' which was generated by parsing // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU // ids. - Status GetValidDeviceIds(const std::vector<CudaGpuId>& visible_gpu_order, - std::vector<CudaGpuId>* ids); + Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order, + std::vector<PlatformGpuId>* ids); - // visible_gpu_initialized_[cuda_gpu_id] is true if visible GPU cuda_gpu_id - // has been initialized by the process. + // visible_gpu_initialized_[platform_gpu_id] is true if visible GPU + // platform_gpu_id has been initialized by the process. std::unordered_map<int, bool> visible_gpu_initialized_; }; |