diff options
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 223 | ||||
-rw-r--r-- | tensorflow/stream_executor/host_buffer.h | 48 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.cc | 356 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.h | 68 |
4 files changed, 560 insertions, 135 deletions
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index d83d3042d5..5db86cefc3 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -38,6 +38,7 @@ limitations under the License. namespace perftools { namespace gputools { +class HostBuffer; class Stream; class ScratchAllocator; @@ -125,6 +126,15 @@ enum class RnnDirectionMode { kRnnBidirectional = 1, }; +// Relevant to DepthToSpace and SpaceToDepth. This is the write layout when +// performing depth to space and the read layout when performing space to depth. +// It's specified with most-major dimension first and most-minor dimension last. +// In DepthToSpace, the D*M² values are read in and then, for DepthHeightWidth, +// written out to the output patch, by varying first width, then height, then +// depth. In C array format, it looks like [depth][height][width]. See +// DepthToSpace comment for more information. +enum class DepthToSpaceLayout { DepthHeightWidth }; + // Specifies the descriptor for a RNN model. // // An example use case: @@ -530,6 +540,13 @@ enum class PoolingMode : int64 { kAverage, }; +// Specify the dimension in which to concatenate inputs in space. +// Specify int64 so there's no padding in SpaceConcatenateMode. +enum class SpaceConcatenateMode : int64 { + XDirection, + YDirection, +}; + // Returns a short name for the pooling mode, e.g. "Avg". string ShortPoolingModeString(PoolingMode mode); @@ -1319,6 +1336,129 @@ class DnnSupport { port::ArraySlice<const DeviceMemory<float>*> input_data, DeviceMemory<float>* output_data) = 0; + // Concatenates several layers into one, by concatenating each in the + // x-dimension or y-dimension, based on a user-specified flag. + // For x-concatenation, layers are aligned at matching y and depth + // coordinates, and for y-concatenation, they are aligned at matching x and + // depth coordinates. The inputs must all have the same depth and batch size. 
+ // For x-concatenation, the inputs must have the same height (y-size), and the + // output will have the same depth and height as the inputs and its width (x- + // size) will be the sum of the input widths. For y-concatenation, the inputs + // must have the same width, and the output will have the same depth and width + // as the inputs, and its height will be the sum of the input heights. + // + // Arguments: + // stream: borrowed pointer to the stream that the 'space concatenate' + // operation should be enqueued onto. + // input_dimensions: the dimensions of each input. + // input_data: un-owned device memory region which contains the input data + // for each input layer. + // output_data: un-owned device memory region in which to place the space + // concatenate result. + // concat_direction: either dnn::SpaceConcatenateMode::XDirection or + // dnn::SpaceConcatenateMode::YDirection. + virtual bool DoSpaceConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data, + dnn::SpaceConcatenateMode concat_direction) { + return false; + } + + // Change the layout of the data by shrinking one dimension (or set of + // dimensions) and growing another dimension (or set of dimensions), while + // keeping the total number of data elements constant, and maintaining the + // current data ordering. + // + // Currently, the only supported operation is depth into space by a power of + // 2. E.g. (y, x, z) -> (y*2, x*2, z/4) + // + // Note that Reshape may not be a no-op, depending on the platform and which + // dimensions are being changed. + // + // Example: forgetting about batch for the moment, let's take a tensor that's + // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout + // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The + // elements of the tensor range from 0 to 15. 
The x,y,z indices are below each + // element. + // + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1 + // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 + // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7 + // + // reshape to 4x2x2 + // + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3 + // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 + // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 + virtual bool DoReshape(Stream* stream, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + return false; + } + + // Depth to space takes an X by Y image with depth D*M² and changes it to an + // MX x MY image with depth D. Each input location (x,y) with depth D*M² in + // the input image is changed to an MxM contiguous area in the output image, + // with the values being laid out in the raster order by DepthToSpaceLayout, + // and will have a new depth of D. + // + // Example. + // M=2, Din =8, Xin=2, Yin=2. Xout=4, Yout=4, Dout=2 + // DepthHeightWidth layout + // Values within a 'cell' are at different depths and same x & y. + // Input: + // abcdefgh ijklmnop + // qrstuvwx yz012345 + // Output: + // ae bf im jn + // cg dh ko lp + // qu rv y2 z3 + // sw tx 04 15 + // + // sqrt_depth_reduction: 'M' in the comment above + virtual bool DoDepthToSpace(Stream* stream, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const DepthToSpaceLayout& depth_to_space_layout, + const int& sqrt_depth_reduction, + DeviceMemory<float>* output_data) { + return false; + } + + // Space to depth is the inverse of depth to space. Space to depth takes each + // non-overlapping M by M patch (in the X and Y dimensions) with depth D of + // the input, and transforms it to a 1 by 1 patch with depth D*M². 
If the + // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of + // data elements is not changed. + // + // Example. + // M=2, Din =2, Xin=4, Yin=4, Dout=8 + // DepthHeightWidth layout + // Values within a 'cell' are at different depths and same x & y. + // Input: + // ae bf im jn + // cg dh ko lp + // qu rv y2 z3 + // sw tx 04 15 + // Output: + // abcdefgh ijklmnop + // qrstuvwx yz012345 + // + // sqrt_depth_increase: 'M' in the comment above + virtual bool DoSpaceToDepth(Stream* stream, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const DepthToSpaceLayout& space_to_depth_layout, + const int& sqrt_depth_increase, + DeviceMemory<float>* output_data) { + return false; + } + // Computes the specified operation (e.g. addition or multiplication) // between corresponding elements in the inputs and stores the result in the // output element. @@ -1342,6 +1482,37 @@ class DnnSupport { const dnn::BatchDescriptor& output_dimensions, DeviceMemory<float>* output_data) = 0; + // Computes the specified operation (e.g. addition or multiplication) + // between corresponding elements in the inputs and stores the result in the + // output element. Each input is multiplied by a scalar constant and the + // result is divided by a scalar constant. + // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11 + // and the output divisor to 10. + // The inputs and output must all have the same dimensions, but may have + // different quantization parameters (min_value and max_value). + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'elementwise operation' + // should be enqueued onto. + // operation: The operation to perform. + // input_multiplicands: Amount to scale each input. + // output_divisor: Amount to divide the output. + // input_dimensions: The dimensions of each input. 
+ // input_data: un-owned device memory region which contains the + // input data for each input layer. + // output_dimensions: The dimensions of the output. + // output_data: un-owned device memory region in which to place the + // operation result. + virtual bool DoElementwiseOperateScaledQuantized( + Stream* stream, ElementwiseOperation operation, + port::ArraySlice<int> input_multiplicands, int output_divisor, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + return false; + } + // Pads the input with zeros in the X and Y dimensions. The feature_map // dimension is unchanged. // @@ -1382,6 +1553,43 @@ class DnnSupport { int64 left_trim, int64 right_trim, int64 top_trim, int64 bottom_trim, DeviceMemory<float> *output_data) = 0; + // Grows the input tensor by replicating the X and Y dimensions. The batch and + // depth/feature_map dimensions are unchanged. Currently, the input tensor is + // limited to X=1 and Y=1. + // + // For example, the input has dimensions x=2, y=3, and replicate_x=3, + // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1, + // x0y2, x1y0, x0y1, x1y2]. + // Here is the example as a picture. input: + // AB + // CD + // EF + // broadcast result: + // ABABAB + // CDCDCD + // EFEFEF + // ABABAB + // CDCDCD + // EFEFEF + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'elementwise operation' + // should be enqueued onto. + // dimensions: The dimensions of the input. + // input_data: un-owned device memory region which contains the + // input data for the input layer. + // replicate_x: Amount to replicate the input's X dimension. + // replicate_y: Amount to replicate the input's Y dimension. + // output_data: un-owned device memory region in which to place the + // padded result. 
+ virtual bool DoXYBroadcast(Stream* stream, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + int64 replicate_x, int64 replicate_y, + DeviceMemory<float>* output_data) { + return false; + } + // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that // is, bytes instead of scaled floats) into 'host_dst' if they are available // for the underlying DNN implementation. If this quantized output is not @@ -1425,6 +1633,21 @@ class DnnSupport { QuantizedActivationMode mode, DeviceMemory<float>* gpu_unquantized_dst) = 0; + // Enqueues an asynchronous copy of the contents of buffer_src to + // gpu_unquantized_dst. + virtual bool DoCopyHostBuffer2Device( + Stream* stream, HostBuffer* buffer_src, + DeviceMemory<float>* gpu_unquantized_dst) { + return false; + } + + // Enqueues an asynchronous copy of the contents of gpu_unquantized_src to + // buffer_dst. + virtual bool DoCopyDevice2HostBuffer( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + HostBuffer* buffer_dst) { + return false; + } // Create an RNN descriptor based on model shapes and configurations. // The caller retains the ownership of the descriptor. diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h new file mode 100644 index 0000000000..8fa542e9ff --- /dev/null +++ b/tensorflow/stream_executor/host_buffer.h @@ -0,0 +1,48 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_ + +#include "tensorflow/stream_executor/dnn.h" + +namespace perftools { +namespace gputools { + +// A HostBuffer is a block of memory in host memory containing the data for a +// dnn::BatchDescriptor using a device-dependent memory layout. +// Derived classes provide methods to construct a HostBuffer for a specific +// device, and to copy data in and out of the buffer. +class HostBuffer { + public: + const dnn::BatchDescriptor& descriptor() const { return descriptor_; } + + // Returns a string describing the HostBuffer. + virtual string AsString() const = 0; + + protected: + // Construct a HostBuffer from the supplied dnn::BatchDescriptor. + explicit HostBuffer(const dnn::BatchDescriptor& descriptor) + : descriptor_(descriptor) {} + virtual ~HostBuffer() {} + + private: + const dnn::BatchDescriptor descriptor_; +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_ diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 512e882cad..980d544b01 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/host_buffer.h" #include "tensorflow/stream_executor/lib/stacktrace.h" #include "tensorflow/stream_executor/lib/strcat.h" #include "tensorflow/stream_executor/platform.h" @@ -85,6 +86,8 @@ string ToVlogString(const void *ptr) { return out.str(); } +string ToVlogString(const HostBuffer &buffer) { return buffer.AsString(); } + template <class T> string ToVlogString(const std::complex<T> &c) { // StrCat does not convert std::complex to text. @@ -149,6 +152,13 @@ string ToVlogString(port::MutableArraySlice<T> elements) { return ToVlogString(port::ArraySlice<T>(elements)); } +string ToVlogString(dnn::DepthToSpaceLayout depth_to_space_layout) { + switch (depth_to_space_layout) { + case dnn::DepthToSpaceLayout::DepthHeightWidth: + return "DepthToSpaceLayout::DepthHeightWidth"; + } +} + // Used together with PARAM to VLOG calls made to the stream. Intended // to be used like this: // @@ -299,10 +309,7 @@ Stream &Stream::ThenBatchNormalizationForward( saved_inv_var, is_training, std::move(var_to_inv_var), std::move(inv_var_to_var))); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -324,10 +331,7 @@ Stream &Stream::ThenBatchNormalizationBackward( this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop, offset_backprop)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -355,10 +359,7 @@ Stream &Stream::ThenConvolveWithScratch( /*scratch_allocator=*/scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + 
SetErrorAndLogNoDnnSupport(); } } return *this; @@ -385,10 +386,7 @@ Stream &Stream::ThenConvolveWithScratch( /*scratch_allocator=*/scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -419,10 +417,7 @@ Stream &Stream::ThenConvolveWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -453,10 +448,7 @@ Stream &Stream::ThenConvolveWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -497,10 +489,7 @@ Stream &Stream::ThenSeparableConvolve( depth_multiplier, first_weights, second_weights, convolution_descriptor, output_descriptor, output)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -528,10 +517,7 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch( backward_input_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -564,10 +550,7 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -600,10 +583,7 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using 
StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -631,10 +611,7 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch( backward_input_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -676,10 +653,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch( backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -712,10 +686,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -743,10 +714,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch( backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -779,10 +747,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -817,10 +782,7 @@ Stream &Stream::ThenConvolveBackwardBiasImpl( bias_descriptor, backward_bias_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -866,10 +828,7 @@ Stream &Stream::ThenMatMul(const DeviceMemory<float> &input_data, 
CheckError(dnn->DoMatMul(this, input_data, weights, input_dimensions, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -891,10 +850,7 @@ Stream &Stream::ThenMatMulQuantized( weight_scales, input_dimensions, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -916,10 +872,7 @@ Stream &Stream::ThenMatMulQuantized( weight_scales, input_dimensions, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -937,10 +890,7 @@ Stream &Stream::ThenBiasAdd(const DeviceMemory<float> &input_data, CheckError( dnn->DoBiasAdd(this, input_data, biases, dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -961,10 +911,7 @@ Stream &Stream::ThenPoolForward( input_data, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -985,10 +932,7 @@ Stream &Stream::ThenPoolForward( input_data, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1012,10 +956,7 @@ Stream &Stream::ThenPoolBackward( input_data, output_dimensions, output_data, input_diff_data, output_diff_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN 
operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1039,10 +980,7 @@ Stream &Stream::ThenPoolBackward( input_data, output_dimensions, output_data, input_diff_data, output_diff_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1058,10 +996,7 @@ Stream &Stream::ThenNormalize( CheckError(dnn->DoNormalize(this, normalize_descriptor, input_data, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1079,10 +1014,7 @@ Stream &Stream::ThenNormalizeWithDimensions( CheckError(dnn->DoNormalizeWithDimensions( this, normalize_descriptor, dimensions, input_data, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1104,10 +1036,7 @@ Stream &Stream::ThenNormalizeBackwardWithDimensions( this, normalize_descriptor, dimensions, raw_data, normalized_data, normalized_variable_gradient, raw_variable_gradient)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1125,10 +1054,7 @@ Stream &Stream::ThenActivate(dnn::ActivationMode activation_mode, CheckError(dnn->DoActivate(this, activation_mode, dimensions, input_data, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1158,10 +1084,114 @@ Stream &Stream::ThenDepthConcatenate( CheckError(dnn->DoDepthConcatenate(this, input_dimensions, input_data, output_data)); } else { + 
SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenSpaceConcatenate( + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + DeviceMemory<float> *output_data, + dnn::SpaceConcatenateMode concat_direction) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), PARAM(output_data)); + + // Check that the input dimensions of all the other batches match those of the + // first batch. + for (size_t i = 1; i < input_dimensions.size(); ++i) { + if ((concat_direction == dnn::SpaceConcatenateMode::XDirection) && + (input_dimensions[i].count() != input_dimensions[0].count() || + input_dimensions[i].height() != input_dimensions[0].height() || + input_dimensions[i].feature_map_count() != + input_dimensions[0].feature_map_count())) { SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + LOG(ERROR) << "Incompatible dimensions for X concatenation.\n" + << "input_dimensions[0]: " << input_dimensions[0].ToString() + << "input_dimensions[" << i + << "]: " << input_dimensions[i].ToString(); + return *this; + } + + if ((concat_direction == dnn::SpaceConcatenateMode::YDirection) && + (input_dimensions[i].count() != input_dimensions[0].count() || + input_dimensions[i].width() != input_dimensions[0].width() || + input_dimensions[i].feature_map_count() != + input_dimensions[0].feature_map_count())) { + SetError(); + LOG(ERROR) << "Incompatible dimensions for Y concatenation.\n" + << "input_dimensions[0]: " << input_dimensions[0].ToString() + << "input_dimensions[" << i + << "]: " << input_dimensions[i].ToString(); + return *this; + } + } + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoSpaceConcatenate(this, input_dimensions, input_data, + output_data, concat_direction)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenReshape(const dnn::BatchDescriptor 
&input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), + PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoReshape(this, input_dimensions, input_data, + output_dimensions, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenDepthToSpace( + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &depth_to_space_layout, + const int sqrt_depth_reduction, DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), + PARAM(depth_to_space_layout), PARAM(sqrt_depth_reduction), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoDepthToSpace(this, input_dimensions, input_data, + depth_to_space_layout, + sqrt_depth_reduction, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenSpaceToDepth( + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &space_to_depth_layout, + const int sqrt_depth_increase, DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), + PARAM(space_to_depth_layout), PARAM(sqrt_depth_increase), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoSpaceToDepth(this, input_dimensions, input_data, + space_to_depth_layout, sqrt_depth_increase, + output_data)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1182,10 +1212,30 @@ Stream &Stream::ThenElementwiseOperate( input_data, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using 
StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenElementwiseOperateScaledQuantized( + dnn::ElementwiseOperation operation, + port::ArraySlice<int> input_multiplicands, int output_divisor, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(operation), PARAM(input_multiplicands), PARAM(output_divisor), + PARAM(input_dimensions), PARAM(input_data), + PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoElementwiseOperateScaledQuantized( + this, operation, input_multiplicands, output_divisor, + input_dimensions, input_data, output_dimensions, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1204,10 +1254,7 @@ Stream &Stream::ThenXYPad(const dnn::BatchDescriptor &dimensions, CheckError(dnn->DoXYPad(this, dimensions, input_data, left_pad, right_pad, top_pad, bottom_pad, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1228,10 +1275,25 @@ Stream &Stream::ThenXYSlice(const dnn::BatchDescriptor &dimensions, right_trim, top_trim, bottom_trim, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenXYBroadcast(const dnn::BatchDescriptor &dimensions, + const DeviceMemory<float> &input_data, + int64 replicate_x, int64 replicate_y, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(dimensions), PARAM(input_data), PARAM(replicate_x), + PARAM(replicate_y), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport 
*dnn = parent_->AsDnn()) { + CheckError(dnn->DoXYBroadcast(this, dimensions, input_data, replicate_x, + replicate_y, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1248,10 +1310,7 @@ Stream &Stream::ThenMemcpyD2HQuantized( CheckError(dnn->DoMemcpyD2HQuantized(this, gpu_unquantized_src, mode, host_dst, size)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1268,10 +1327,37 @@ Stream &Stream::ThenMemcpyH2DQuantized( CheckError(dnn->DoMemcpyH2DQuantized(this, host_src, size, mode, gpu_unquantized_dst)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenCopyHostBuffer2Device( + HostBuffer *buffer_src, DeviceMemory<float> *gpu_unquantized_dst) { + VLOG_CALL(PARAM(*buffer_src), PARAM(gpu_unquantized_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoCopyHostBuffer2Device(this, buffer_src, gpu_unquantized_dst)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenCopyDevice2HostBuffer( + const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst) { + VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(*buffer_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoCopyDevice2HostBuffer(this, gpu_unquantized_src, buffer_dst)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index 0d16495a1d..711eb3079a 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -499,6 +499,44 @@ class Stream { port::ArraySlice<const DeviceMemory<float> *> input_data, DeviceMemory<float> *output_data); + Stream 
&ThenSpaceConcatenate( + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + DeviceMemory<float> *output_data, + dnn::SpaceConcatenateMode concat_direction); + + // Change the layout of the data by shrinking one dimension (or set of + // dimensions) and growing another dimension (or set of dimensions), while + // keeping the total number of data elements constant, and maintaining the + // current data ordering. + Stream &ThenReshape(const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + + // Depth to space takes an X by Y image with depth D*M² and changes it to an + // MX x MY image with depth D. Each input location (x,y) with depth D*M² in + // the input image is changed to an MxM contiguous area in the output image, + // with the values being laid out in raster order specified by + // DepthToSpaceLayout, and will have a new depth of D. + // See the DoDepthToSpace comment for more information. + Stream &ThenDepthToSpace(const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &depth_to_space_layout, + const int sqrt_depth_reduction, + DeviceMemory<float> *output_data); + + // Space to depth is the inverse of depth to space. Space to depth takes each + // non-overlapping M by M patch (in the X and Y dimensions) with depth D of + // the input, and transforms it to a 1 by 1 patch with depth D*M². If the + // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of + // data elements is not changed. 
+ Stream &ThenSpaceToDepth(const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &space_to_depth_layout, + const int sqrt_depth_increase, + DeviceMemory<float> *output_data); + Stream &ThenElementwiseOperate( dnn::ElementwiseOperation operation, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, @@ -506,6 +544,14 @@ class Stream { const dnn::BatchDescriptor &output_dimensions, DeviceMemory<float> *output_data); + Stream &ThenElementwiseOperateScaledQuantized( + dnn::ElementwiseOperation operation, + port::ArraySlice<int> input_multiplicands, int output_divisor, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + Stream &ThenXYPad(const dnn::BatchDescriptor &dimensions, const DeviceMemory<float> &input_data, int64 left_pad, int64 right_pad, int64 top_pad, int64 bottom_pad, @@ -516,6 +562,14 @@ class Stream { int64 right_trim, int64 top_trim, int64 bottom_trim, DeviceMemory<float> *output_data); + // Grows the input tensor by replicating the X and Y dimensions. The batch and + // depth/feature_map dimensions are unchanged. Currently, the input tensor is + // limited to X=1 and Y=1. + Stream &ThenXYBroadcast(const dnn::BatchDescriptor &dimensions, + const DeviceMemory<float> &input_data, + int64 replicate_x, int64 replicate_y, + DeviceMemory<float> *output_data); + // See DnnSupport::DoMemcpyD2HQuantized. Stream &ThenMemcpyD2HQuantized(const DeviceMemory<float> &gpu_unquantized_src, dnn::QuantizedActivationMode mode, @@ -549,6 +603,14 @@ class Stream { Quantization<ElementType>::kModeId, gpu_unquantized_dst); } + // See DnnSupport::DoCopyHostBuffer2Device. + Stream &ThenCopyHostBuffer2Device(HostBuffer *buffer_src, + DeviceMemory<float> *gpu_unquantized_dst); + + // See DnnSupport::DoCopyDevice2HostBuffer. 
+ Stream &ThenCopyDevice2HostBuffer( + const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst); + ///////////////// // BLAS support @@ -1527,6 +1589,12 @@ class Stream { void SetError() { CheckError(false /* = operation_retcode */); } + void SetErrorAndLogNoDnnSupport() { + SetError(); + LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + // The StreamExecutor that supports the operation of this stream. StreamExecutor *parent_; |