-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc | 12
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h  |  2
-rw-r--r--  tensorflow/core/framework/allocator.h                   | 29
-rw-r--r--  tensorflow/core/framework/op_kernel.h                   | 38
-rw-r--r--  tensorflow/core/framework/tensor.cc                     | 17
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu.h                  |  5
-rw-r--r--  tensorflow/core/public/tensor.h                         |  8
7 files changed, 97 insertions, 14 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index a031d2f1e4..ee2d5a869c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -104,6 +104,18 @@ void* GPUBFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
       kMaxMillisToWait, unused_alignment, num_bytes);
 }
 
+void* GPUBFCAllocator::AllocateRaw(
+    size_t unused_alignment, size_t num_bytes,
+    const AllocationAttributes& allocation_attr) {
+  if (allocation_attr.no_retry_on_failure) {
+    // Return immediately upon the first failure if this is for allocating an
+    // optional scratch space.
+    return AllocateRawInternal(unused_alignment, num_bytes, true);
+  } else {
+    return AllocateRaw(unused_alignment, num_bytes);
+  }
+}
+
 void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment,
                                            size_t num_bytes,
                                            bool dump_log_on_failure) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index 925fe8aa21..c2edf76dc0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -49,6 +49,8 @@ class GPUBFCAllocator : public VisitableAllocator {
   string Name() override { return "gpu_bfc"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+  void* AllocateRaw(size_t alignment, size_t num_bytes,
+                    const AllocationAttributes& allocation_attr) override;
   void DeallocateRaw(void* ptr) override;
 
   void AddAllocVisitor(Visitor visitor) override;
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 548c9d54d2..41bbb08b3f 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -26,6 +26,16 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Attributes for a single allocation call. Different calls to the same
+// allocator could potentially have different allocation attributes.
+struct AllocationAttributes {
+  // If the first attempt to allocate the memory fails, the allocation
+  // should return immediately without retrying.
+  // An example use case is optional scratch spaces where a failure
+  // has only performance impact.
+  bool no_retry_on_failure = false;
+};
+
 // Allocator is an abstract interface for allocating and deallocating
 // device memory.
 class Allocator {
@@ -41,6 +51,17 @@ class Allocator {
   // REQUIRES: "alignment" is a power of 2.
   virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
 
+  // Return an uninitialized block of memory that is "num_bytes" bytes
+  // in size with specified allocation attributes. The returned pointer is
+  // guaranteed to be aligned to a multiple of "alignment" bytes.
+  // REQUIRES: "alignment" is a power of 2.
+  virtual void* AllocateRaw(size_t alignment, size_t num_bytes,
+                            const AllocationAttributes& allocation_attr) {
+    // The default behavior is to use the implementation without any allocation
+    // attributes.
+    return AllocateRaw(alignment, num_bytes);
+  }
+
   // Deallocate a block of memory pointer to by "ptr"
   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
   virtual void DeallocateRaw(void* ptr) = 0;
@@ -50,6 +71,12 @@ class Allocator {
   // tensor has too many elements to represent in a single allocation.
   template <typename T>
   T* Allocate(size_t num_elements) {
+    return Allocate<T>(num_elements, AllocationAttributes());
+  }
+
+  template <typename T>
+  T* Allocate(size_t num_elements,
+              const AllocationAttributes& allocation_attr) {
     // TODO(jeff): Do we need to allow clients to pass in alignment
     // requirements?
 
@@ -58,7 +85,7 @@ class Allocator {
     }
 
     void* p = AllocateRaw(32 /* align to 32 byte boundary */,
-                          sizeof(T) * num_elements);
+                          sizeof(T) * num_elements, allocation_attr);
     return reinterpret_cast<T*>(p);
   }
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 362202ac65..e61ddd0e2e 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -647,7 +647,13 @@ class OpKernelContext {
   // may retain references to the temporary tensors after the Op's
   // Compute method has run. See comment above.
   Status allocate_temp(DataType type, const TensorShape& shape,
-                       Tensor* out_temp, AllocatorAttributes attr);
+                       Tensor* out_temp, AllocatorAttributes allocator_attr,
+                       const AllocationAttributes& allocation_attr);
+  Status allocate_temp(DataType type, const TensorShape& shape,
+                       Tensor* out_temp, AllocatorAttributes allocator_attr) {
+    return allocate_temp(type, shape, out_temp, allocator_attr,
+                         AllocationAttributes());
+  }
   Status allocate_temp(DataType type, const TensorShape& shape,
                        Tensor* out_temp) {
     return allocate_temp(type, shape, out_temp, AllocatorAttributes());
@@ -851,7 +857,15 @@ class OpKernelContext {
 
   // Internal common method used when allocating tensor memory
   Status allocate_tensor(DataType type, const TensorShape& shape,
-                         Tensor* out_tensor, AllocatorAttributes attr);
+                         Tensor* out_tensor,
+                         AllocatorAttributes allocator_attr) {
+    return allocate_tensor(type, shape, out_tensor, allocator_attr,
+                           AllocationAttributes());
+  }
+
+  Status allocate_tensor(DataType type, const TensorShape& shape,
+                         Tensor* out_tensor, AllocatorAttributes allocator_attr,
+                         const AllocationAttributes& allocation_attr);
 
   // This is called by PersistentTensor::AccessTensor whenever the
   // wrapped tensor is retrieved, to ensure the runtime knows that the
@@ -1085,12 +1099,11 @@ inline Status OpKernelContext::allocate_output(int index,
   return allocate_output(index, shape, output, attr);
 }
 
-inline Status OpKernelContext::allocate_tensor(DataType type,
-                                               const TensorShape& shape,
-                                               Tensor* out_tensor,
-                                               AllocatorAttributes attr) {
+inline Status OpKernelContext::allocate_tensor(
+    DataType type, const TensorShape& shape, Tensor* out_tensor,
+    AllocatorAttributes attr, const AllocationAttributes& allocation_attr) {
   Allocator* a = get_allocator(attr);
-  Tensor new_tensor(a, type, shape);
+  Tensor new_tensor(a, type, shape, allocation_attr);
   if (!new_tensor.IsInitialized() && shape.num_elements() > 0) {
     return errors::ResourceExhausted("OOM when allocating tensor with shape",
@@ -1121,11 +1134,12 @@ inline Status OpKernelContext::allocate_output(int index,
   return s;
 }
 
-inline Status OpKernelContext::allocate_temp(DataType type,
-                                             const TensorShape& shape,
-                                             Tensor* out_temp,
-                                             AllocatorAttributes attr) {
-  Status s = allocate_tensor(type, shape, out_temp, attr);
+inline Status OpKernelContext::allocate_temp(
+    DataType type, const TensorShape& shape, Tensor* out_temp,
+    AllocatorAttributes allocator_attr,
+    const AllocationAttributes& allocation_attr) {
+  Status s =
+      allocate_tensor(type, shape, out_temp, allocator_attr, allocation_attr);
   if (s.ok()) {
     if (params_.device->SaveTemporaryTensors()) {
       // keep a reference to the underlying memory around
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index f14efdc913..9f573d2056 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -51,6 +51,7 @@ template <typename T>
 class Buffer : public TensorBuffer {
  public:
   Buffer(Allocator* a, int64 n);
+  Buffer(Allocator* a, int64 n, const AllocationAttributes& allocation_attr);
 
   void* data() const override { return data_; }
   size_t size() const override { return sizeof(T) * elem_; }
@@ -277,6 +278,13 @@ Buffer<T>::Buffer(Allocator* a, int64 n)
 }
 
 template <typename T>
+Buffer<T>::Buffer(Allocator* a, int64 n,
+                  const AllocationAttributes& allocation_attr)
+    : alloc_(a), data_(a->Allocate<T>(n, allocation_attr)), elem_(n) {
+  if (data_) Helper<T>::RunCtor(data_, elem_);
+}
+
+template <typename T>
 Buffer<T>::~Buffer() {
   if (data_) {
     Helper<T>::RunDtor(data_, elem_);
@@ -409,6 +417,15 @@ Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape)
   }
 }
 
+Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape,
+               const AllocationAttributes& allocation_attr)
+    : type_(type), shape_(shape), buf_(nullptr) {
+  CHECK_NOTNULL(a);
+  if (shape_.num_elements() > 0) {
+    CASES(type, buf_ = new Buffer<T>(a, shape.num_elements(), allocation_attr));
+  }
+}
+
 Tensor::Tensor(DataType type, const TensorShape& shape)
     : Tensor(cpu_allocator(), type, shape) {}
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index bcdc1c3510..8f131f7b81 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -57,8 +57,11 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
   AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
 
+    AllocationAttributes allocation_attr;
+    allocation_attr.no_retry_on_failure = true;
     Status allocation_status(context_->allocate_temp(
-        DT_UINT8, TensorShape({byte_size}), &temporary_memory));
+        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
+        AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
       LOG(WARNING) << allocation_status;
       return perftools::gputools::port::StatusOr<
diff --git a/tensorflow/core/public/tensor.h b/tensorflow/core/public/tensor.h
index c613831a8d..cbed45363d 100644
--- a/tensorflow/core/public/tensor.h
+++ b/tensorflow/core/public/tensor.h
@@ -54,6 +54,14 @@ class Tensor {
   /// `a` must outlive the lifetime of this Tensor.
   Tensor(Allocator* a, DataType type, const TensorShape& shape);
 
+  /// \brief Creates a tensor with the input `type` and `shape`, using the
+  /// allocator `a` and the specified "allocation_attr" to allocate the
+  /// underlying buffer.
+  ///
+  /// `a` must outlive the lifetime of this Tensor.
+  Tensor(Allocator* a, DataType type, const TensorShape& shape,
+         const AllocationAttributes& allocation_attr);
+
   /// Creates an uninitialized Tensor of the given data type.
   explicit Tensor(DataType type);
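
A minimal sketch of the caller-side pattern the conv_ops_gpu.h hunk establishes: a kernel that wants optional scratch space sets no_retry_on_failure so a failed allocation costs only performance instead of stalling on the allocator's retry path. This assumes it runs from an OpKernel::Compute() with an OpKernelContext* ctx; the helper name AllocateOptionalScratch and the fallback flag are illustrative, not part of this commit.

// Sketch: requesting optional scratch space from inside a kernel.
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

void AllocateOptionalScratch(OpKernelContext* ctx, int64 scratch_bytes,
                             Tensor* scratch, bool* scratch_ok) {
  AllocationAttributes allocation_attr;
  allocation_attr.no_retry_on_failure = true;  // fail fast, do not wait
  Status s = ctx->allocate_temp(DT_UINT8, TensorShape({scratch_bytes}),
                                scratch, AllocatorAttributes(),
                                allocation_attr);
  if (!s.ok()) {
    // The scratch space is optional, so a failure only costs performance:
    // log it and let the caller pick a slower, scratch-free code path.
    LOG(WARNING) << s;
  }
  *scratch_ok = s.ok();
}

}  // namespace tensorflow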
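The new Tensor constructor can also be used directly. With no_retry_on_failure set, a failed single attempt leaves the underlying Buffer's data pointer null, so the tensor reports !IsInitialized() (the same check allocate_tensor() above turns into a ResourceExhausted status) rather than blocking until memory frees up. A sketch, assuming a device Allocator* a and a size num_bytes are in scope; both names are illustrative.

// Sketch: optional allocation through the new Tensor constructor.
AllocationAttributes allocation_attr;
allocation_attr.no_retry_on_failure = true;
Tensor scratch(a, DT_UINT8, TensorShape({num_bytes}), allocation_attr);
if (!scratch.IsInitialized()) {
  // Optional buffer unavailable; continue without it.
}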