diff options
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 223 | ||||
-rw-r--r-- | tensorflow/stream_executor/host_buffer.h | 48 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.cc | 356 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.h | 68 |
4 files changed, 560 insertions, 135 deletions
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index d83d3042d5..5db86cefc3 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -38,6 +38,7 @@ limitations under the License. namespace perftools { namespace gputools { +class HostBuffer; class Stream; class ScratchAllocator; @@ -125,6 +126,15 @@ enum class RnnDirectionMode { kRnnBidirectional = 1, }; +// Relevant to DepthToSpace and SpaceToDepth. This is the write layout when +// performing depth to space and the read layout when performing space to depth. +// It's specified with most-major dimension first and most-minor dimension last. +// In DepthToSpace, the D*M² values are read in and then, for DepthHeightWidth, +// written out to the output patch, by varying first width, then height, then +// depth. In C array format, it looks like [depth][height][width]. See +// DepthToSpace comment for more information. +enum class DepthToSpaceLayout { DepthHeightWidth }; + // Specifies the descriptor for a RNN model. // // An example use case: @@ -530,6 +540,13 @@ enum class PoolingMode : int64 { kAverage, }; +// Specify the dimension in which to concatenate inputs in space. +// Specify int64 so there's no padding in SpaceConcatenateMode. +enum class SpaceConcatenateMode : int64 { + XDirection, + YDirection, +}; + // Returns a short name for the pooling mode, e.g. "Avg". string ShortPoolingModeString(PoolingMode mode); @@ -1319,6 +1336,129 @@ class DnnSupport { port::ArraySlice<const DeviceMemory<float>*> input_data, DeviceMemory<float>* output_data) = 0; + // Concatenates several layers into one, by concatenating each in the + // x-dimension or y-dimension, based on a user-specified flag. + // For x-concatenation, layers are aligned at matching y and depth + // coordinates, and for y-concatenation, they are aligned at matching x and + // depth coordinates. The inputs must all have the same depth and batch size. 
+ // For x-concatenation, the inputs must have the same height (y-size), and the + // output will have the same depth and height as the inputs and its width (x- + // size) will be the sum of the input widths. For y-concatenation, the inputs + // must have the same width, and the output will have the same depth and width + // as the inputs, and its height will be the sum of the input heights. + // + // Arguments: + // stream: borrowed pointer to the stream that the 'space concatenate' + // operation should be enqueued onto. + // input_dimensions: the dimensions of each input. + // input_data: un-owned device memory region which contains the input data + // for each input layer. + // output_data: un-owned device memory region in which to place the space + // concatenate result. + // concat_direction: either dnn::SpaceConcatenateMode::XDirection or + // dnn::SpaceConcatenateMode::YDirection. + virtual bool DoSpaceConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data, + dnn::SpaceConcatenateMode concat_direction) { + return false; + } + + // Change the layout of the data by shrinking one dimension (or set of + // dimensions) and growing another dimension (or set of dimensions), while + // keeping the total number of data elements constant, and maintaining the + // current data ordering. + // + // Currently, the only supported operation is depth into space by a power of + // 2. E.g. (y, x, z) -> (y*2, x*2, z/4) + // + // Note that Reshape may not be a no-op, depending on the platform and which + // dimensions are being changed. + // + // Example: forgetting about batch for the moment, let's take a tensor that's + // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout + // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The + // elements of the tensor range from 0 to 15. 
The x,y,z indices are below each + // element. + // + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1 + // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 + // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7 + // + // reshape to 4x2x2 + // + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3 + // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 + // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 + virtual bool DoReshape(Stream* stream, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + return false; + } + + // Depth to space takes an X by Y image with depth D*M² and changes it to an + // MX x MY image with depth D. Each input location (x,y) with depth D*M² in + // the input image is changed to an MxM contiguous area in the output image, + // with the values being laid out in the raster order by DepthToSpaceLayout, + // and will have a new depth of D. + // + // Example. + // M=2, Din =8, Xin=2, Yin=2. Xout=4, Yout=4, Dout=2 + // DepthHeightWidth layout + // Values within a 'cell' are at different depths and same x & y. + // Input: + // abcdefgh ijklmnop + // qrstuvwx yz012345 + // Output: + // ae bf im jn + // cg dh ko lp + // qu rv y2 z3 + // sw tx 04 15 + // + // sqrt_depth_reduction: 'M' in the comment above + virtual bool DoDepthToSpace(Stream* stream, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const DepthToSpaceLayout& depth_to_space_layout, + const int& sqrt_depth_reduction, + DeviceMemory<float>* output_data) { + return false; + } + + // Space to depth is the inverse of depth to space. Space to depth takes each + // non-overlapping M by M patch (in the X and Y dimensions) with depth D of + // the input, and transforms it to a 1 by 1 patch with depth D*M². 
If the + // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of + // data elements is not changed. + // + // Example. + // M=2, Din =2, Xin=4, Yin=4, Dout=8 + // DepthHeightWidth layout + // Values within a 'cell' are at different depths and same x & y. + // Input: + // ae bf im jn + // cg dh ko lp + // qu rv y2 z3 + // sw tx 04 15 + // Output: + // abcdefgh ijklmnop + // qrstuvwx yz012345 + // + // sqrt_depth_increase: 'M' in the comment above + virtual bool DoSpaceToDepth(Stream* stream, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const DepthToSpaceLayout& space_to_depth_layout, + const int& sqrt_depth_increase, + DeviceMemory<float>* output_data) { + return false; + } + // Computes the specified operation (e.g. addition or multiplication) // between corresponding elements in the inputs and stores the result in the // output element. @@ -1342,6 +1482,37 @@ class DnnSupport { const dnn::BatchDescriptor& output_dimensions, DeviceMemory<float>* output_data) = 0; + // Computes the specified operation (e.g. addition or multiplication) + // between corresponding elements in the inputs and stores the result in the + // output element. Each input is multiplied by a scalar constant and the + // result is divided by a scalar constant. + // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11 + // and the output divisor to 10. + // The inputs and output must all have the same dimensions, but may have + // different quantization parameters (min_value and max_value). + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'elementwise operation' + // should be enqueued onto. + // operation: The operation to perform. + // input_multiplicands: Amount to scale each input. + // output_divisor: Amount to divide the output. + // input_dimensions: The dimensions of each input. 
+ // input_data: un-owned device memory region which contains the + // input data for each input layer. + // output_dimensions: The dimensions of the output. + // output_data: un-owned device memory region in which to place the + // operation result. + virtual bool DoElementwiseOperateScaledQuantized( + Stream* stream, ElementwiseOperation operation, + port::ArraySlice<int> input_multiplicands, int output_divisor, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) { + return false; + } + // Pads the input with zeros in the X and Y dimensions. The feature_map // dimension is unchanged. // @@ -1382,6 +1553,43 @@ class DnnSupport { int64 left_trim, int64 right_trim, int64 top_trim, int64 bottom_trim, DeviceMemory<float> *output_data) = 0; + // Grows the input tensor by replicating the X and Y dimensions. The batch and + // depth/feature_map dimensions are unchanged. Currently, the input tensor is + // limited to X=1 and Y=1. + // + // For example, the input has dimensions x=2, y=3, and replicate_x=3, + // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1, + // x0y2, x1y0, x0y1, x1y2]. + // Here is the example as a picture. input: + // AB + // CD + // EF + // broadcast result: + // ABABAB + // CDCDCD + // EFEFEF + // ABABAB + // CDCDCD + // EFEFEF + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'elementwise operation' + // should be enqueued onto. + // dimensions: The dimensions of the input. + // input_data: un-owned device memory region which contains the + // input data for the input layer. + // replicate_x: Amount to replicate the input's X dimension. + // replicate_y: Amount to replicate the input's Y dimension. + // output_data: un-owned device memory region in which to place the + // padded result. 
+ virtual bool DoXYBroadcast(Stream* stream, + const dnn::BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + int64 replicate_x, int64 replicate_y, + DeviceMemory<float>* output_data) { + return false; + } + // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that // is, bytes instead of scaled floats) into 'host_dst' if they are available // for the underlying DNN implementation. If this quantized output is not @@ -1425,6 +1633,21 @@ class DnnSupport { QuantizedActivationMode mode, DeviceMemory<float>* gpu_unquantized_dst) = 0; + // Enqueues an asynchronous copy of the contents of buffer_src to + // gpu_unquantized_dst. + virtual bool DoCopyHostBuffer2Device( + Stream* stream, HostBuffer* buffer_src, + DeviceMemory<float>* gpu_unquantized_dst) { + return false; + } + + // Enqueues an asynchronous copy of the contents of gpu_unquantized_src to + // buffer_dst. + virtual bool DoCopyDevice2HostBuffer( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + HostBuffer* buffer_dst) { + return false; + } // Create an RNN descriptor based on model shapes and configurations. // The caller retains the ownership of the descriptor. diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h new file mode 100644 index 0000000000..8fa542e9ff --- /dev/null +++ b/tensorflow/stream_executor/host_buffer.h @@ -0,0 +1,48 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_ +#define TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_ + +#include "tensorflow/stream_executor/dnn.h" + +namespace perftools { +namespace gputools { + +// A HostBuffer is a block of memory in host memory containing the data for a +// dnn::BatchDescriptor using a device-dependent memory layout. +// Derived classes provide methods to construct a HostBuffer for a specific +// device, and to copy data in and out of the buffer. +class HostBuffer { + public: + const dnn::BatchDescriptor& descriptor() const { return descriptor_; } + + // Returns a string describing the HostBuffer. + virtual string AsString() const = 0; + + protected: + // Construct a HostBuffer from the supplied dnn::BatchDescriptor. + explicit HostBuffer(const dnn::BatchDescriptor& descriptor) + : descriptor_(descriptor) {} + virtual ~HostBuffer() {} + + private: + const dnn::BatchDescriptor descriptor_; +}; + +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_ diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 512e882cad..980d544b01 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/host_buffer.h" #include "tensorflow/stream_executor/lib/stacktrace.h" #include "tensorflow/stream_executor/lib/strcat.h" #include "tensorflow/stream_executor/platform.h" @@ -85,6 +86,8 @@ string ToVlogString(const void *ptr) { return out.str(); } +string ToVlogString(const HostBuffer &buffer) { return buffer.AsString(); } + template <class T> string ToVlogString(const std::complex<T> &c) { // StrCat does not convert std::complex to text. @@ -149,6 +152,13 @@ string ToVlogString(port::MutableArraySlice<T> elements) { return ToVlogString(port::ArraySlice<T>(elements)); } +string ToVlogString(dnn::DepthToSpaceLayout depth_to_space_layout) { + switch (depth_to_space_layout) { + case dnn::DepthToSpaceLayout::DepthHeightWidth: + return "DepthToSpaceLayout::DepthHeightWidth"; + } +} + // Used together with PARAM to VLOG calls made to the stream. Intended // to be used like this: // @@ -299,10 +309,7 @@ Stream &Stream::ThenBatchNormalizationForward( saved_inv_var, is_training, std::move(var_to_inv_var), std::move(inv_var_to_var))); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -324,10 +331,7 @@ Stream &Stream::ThenBatchNormalizationBackward( this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop, offset_backprop)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -355,10 +359,7 @@ Stream &Stream::ThenConvolveWithScratch( /*scratch_allocator=*/scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + 
SetErrorAndLogNoDnnSupport(); } } return *this; @@ -385,10 +386,7 @@ Stream &Stream::ThenConvolveWithScratch( /*scratch_allocator=*/scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -419,10 +417,7 @@ Stream &Stream::ThenConvolveWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -453,10 +448,7 @@ Stream &Stream::ThenConvolveWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -497,10 +489,7 @@ Stream &Stream::ThenSeparableConvolve( depth_multiplier, first_weights, second_weights, convolution_descriptor, output_descriptor, output)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -528,10 +517,7 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch( backward_input_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -564,10 +550,7 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -600,10 +583,7 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using 
StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -631,10 +611,7 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch( backward_input_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -676,10 +653,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch( backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -712,10 +686,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -743,10 +714,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch( backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(), nullptr)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -779,10 +747,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm( SetError(); } } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -817,10 +782,7 @@ Stream &Stream::ThenConvolveBackwardBiasImpl( bias_descriptor, backward_bias_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -866,10 +828,7 @@ Stream &Stream::ThenMatMul(const DeviceMemory<float> &input_data, 
CheckError(dnn->DoMatMul(this, input_data, weights, input_dimensions, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -891,10 +850,7 @@ Stream &Stream::ThenMatMulQuantized( weight_scales, input_dimensions, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -916,10 +872,7 @@ Stream &Stream::ThenMatMulQuantized( weight_scales, input_dimensions, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -937,10 +890,7 @@ Stream &Stream::ThenBiasAdd(const DeviceMemory<float> &input_data, CheckError( dnn->DoBiasAdd(this, input_data, biases, dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -961,10 +911,7 @@ Stream &Stream::ThenPoolForward( input_data, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -985,10 +932,7 @@ Stream &Stream::ThenPoolForward( input_data, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1012,10 +956,7 @@ Stream &Stream::ThenPoolBackward( input_data, output_dimensions, output_data, input_diff_data, output_diff_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN 
operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1039,10 +980,7 @@ Stream &Stream::ThenPoolBackward( input_data, output_dimensions, output_data, input_diff_data, output_diff_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1058,10 +996,7 @@ Stream &Stream::ThenNormalize( CheckError(dnn->DoNormalize(this, normalize_descriptor, input_data, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1079,10 +1014,7 @@ Stream &Stream::ThenNormalizeWithDimensions( CheckError(dnn->DoNormalizeWithDimensions( this, normalize_descriptor, dimensions, input_data, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1104,10 +1036,7 @@ Stream &Stream::ThenNormalizeBackwardWithDimensions( this, normalize_descriptor, dimensions, raw_data, normalized_data, normalized_variable_gradient, raw_variable_gradient)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1125,10 +1054,7 @@ Stream &Stream::ThenActivate(dnn::ActivationMode activation_mode, CheckError(dnn->DoActivate(this, activation_mode, dimensions, input_data, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1158,10 +1084,114 @@ Stream &Stream::ThenDepthConcatenate( CheckError(dnn->DoDepthConcatenate(this, input_dimensions, input_data, output_data)); } else { + 
SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenSpaceConcatenate( + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + DeviceMemory<float> *output_data, + dnn::SpaceConcatenateMode concat_direction) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), PARAM(output_data)); + + // Check that the input dimensions of all the other batches match those of the + // first batch. + for (size_t i = 1; i < input_dimensions.size(); ++i) { + if ((concat_direction == dnn::SpaceConcatenateMode::XDirection) && + (input_dimensions[i].count() != input_dimensions[0].count() || + input_dimensions[i].height() != input_dimensions[0].height() || + input_dimensions[i].feature_map_count() != + input_dimensions[0].feature_map_count())) { SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + LOG(ERROR) << "Incompatible dimensions for X concatenation.\n" + << "input_dimensions[0]: " << input_dimensions[0].ToString() + << "input_dimensions[" << i + << "]: " << input_dimensions[i].ToString(); + return *this; + } + + if ((concat_direction == dnn::SpaceConcatenateMode::YDirection) && + (input_dimensions[i].count() != input_dimensions[0].count() || + input_dimensions[i].width() != input_dimensions[0].width() || + input_dimensions[i].feature_map_count() != + input_dimensions[0].feature_map_count())) { + SetError(); + LOG(ERROR) << "Incompatible dimensions for Y concatenation.\n" + << "input_dimensions[0]: " << input_dimensions[0].ToString() + << "input_dimensions[" << i + << "]: " << input_dimensions[i].ToString(); + return *this; + } + } + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoSpaceConcatenate(this, input_dimensions, input_data, + output_data, concat_direction)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenReshape(const dnn::BatchDescriptor 
&input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), + PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoReshape(this, input_dimensions, input_data, + output_dimensions, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenDepthToSpace( + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &depth_to_space_layout, + const int sqrt_depth_reduction, DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), + PARAM(depth_to_space_layout), PARAM(sqrt_depth_reduction), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoDepthToSpace(this, input_dimensions, input_data, + depth_to_space_layout, + sqrt_depth_reduction, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenSpaceToDepth( + const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &space_to_depth_layout, + const int sqrt_depth_increase, DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), + PARAM(space_to_depth_layout), PARAM(sqrt_depth_increase), + PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoSpaceToDepth(this, input_dimensions, input_data, + space_to_depth_layout, sqrt_depth_increase, + output_data)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1182,10 +1212,30 @@ Stream &Stream::ThenElementwiseOperate( input_data, output_dimensions, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using 
StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenElementwiseOperateScaledQuantized( + dnn::ElementwiseOperation operation, + port::ArraySlice<int> input_multiplicands, int output_divisor, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(operation), PARAM(input_multiplicands), PARAM(output_divisor), + PARAM(input_dimensions), PARAM(input_data), + PARAM(output_dimensions), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError(dnn->DoElementwiseOperateScaledQuantized( + this, operation, input_multiplicands, output_divisor, + input_dimensions, input_data, output_dimensions, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1204,10 +1254,7 @@ Stream &Stream::ThenXYPad(const dnn::BatchDescriptor &dimensions, CheckError(dnn->DoXYPad(this, dimensions, input_data, left_pad, right_pad, top_pad, bottom_pad, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1228,10 +1275,25 @@ Stream &Stream::ThenXYSlice(const dnn::BatchDescriptor &dimensions, right_trim, top_trim, bottom_trim, output_data)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenXYBroadcast(const dnn::BatchDescriptor &dimensions, + const DeviceMemory<float> &input_data, + int64 replicate_x, int64 replicate_y, + DeviceMemory<float> *output_data) { + VLOG_CALL(PARAM(dimensions), PARAM(input_data), PARAM(replicate_x), + PARAM(replicate_y), PARAM(output_data)); + + if (ok()) { + if (dnn::DnnSupport 
*dnn = parent_->AsDnn()) { + CheckError(dnn->DoXYBroadcast(this, dimensions, input_data, replicate_x, + replicate_y, output_data)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1248,10 +1310,7 @@ Stream &Stream::ThenMemcpyD2HQuantized( CheckError(dnn->DoMemcpyD2HQuantized(this, gpu_unquantized_src, mode, host_dst, size)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); } } return *this; @@ -1268,10 +1327,37 @@ Stream &Stream::ThenMemcpyH2DQuantized( CheckError(dnn->DoMemcpyH2DQuantized(this, host_src, size, mode, gpu_unquantized_dst)); } else { - SetError(); - LOG(WARNING) - << "attempting to perform DNN operation using StreamExecutor " - "without DNN support"; + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenCopyHostBuffer2Device( + HostBuffer *buffer_src, DeviceMemory<float> *gpu_unquantized_dst) { + VLOG_CALL(PARAM(*buffer_src), PARAM(gpu_unquantized_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoCopyHostBuffer2Device(this, buffer_src, gpu_unquantized_dst)); + } else { + SetErrorAndLogNoDnnSupport(); + } + } + return *this; +} + +Stream &Stream::ThenCopyDevice2HostBuffer( + const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst) { + VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(*buffer_dst)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + CheckError( + dnn->DoCopyDevice2HostBuffer(this, gpu_unquantized_src, buffer_dst)); + } else { + SetErrorAndLogNoDnnSupport(); } } return *this; diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index 0d16495a1d..711eb3079a 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -499,6 +499,44 @@ class Stream { port::ArraySlice<const DeviceMemory<float> *> input_data, DeviceMemory<float> *output_data); + Stream 
&ThenSpaceConcatenate( + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + DeviceMemory<float> *output_data, + dnn::SpaceConcatenateMode concat_direction); + + // Change the layout of the data by shrinking one dimension (or set of + // dimensions) and growing another dimension (or set of dimensions), while + // keeping the total number of data elements constant, and maintaining the + // current data ordering. + Stream &ThenReshape(const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + + // Depth to space takes an X by Y image with depth D*M² and changes it to an + // MX x MY image with depth D. Each input location (x,y) with depth D*M² in + // the input image is changed to an MxM contiguous area in the output image, + // with the values being laid out in raster order specified by + // DepthToSpaceLayout, and will have a new depth of D. + // See the DoDepthToSpace comment for more information. + Stream &ThenDepthToSpace(const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &depth_to_space_layout, + const int sqrt_depth_reduction, + DeviceMemory<float> *output_data); + + // Space to depth is the inverse of depth to space. Space to depth takes each + // non-overlapping M by M patch (in the X and Y dimensions) with depth D of + // the input, and transforms it to a 1 by 1 patch with depth D*M². If the + // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of + // data elements is not changed. 
+ Stream &ThenSpaceToDepth(const dnn::BatchDescriptor &input_dimensions, + const DeviceMemory<float> &input_data, + const dnn::DepthToSpaceLayout &space_to_depth_layout, + const int sqrt_depth_increase, + DeviceMemory<float> *output_data); + Stream &ThenElementwiseOperate( dnn::ElementwiseOperation operation, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, @@ -506,6 +544,14 @@ class Stream { const dnn::BatchDescriptor &output_dimensions, DeviceMemory<float> *output_data); + Stream &ThenElementwiseOperateScaledQuantized( + dnn::ElementwiseOperation operation, + port::ArraySlice<int> input_multiplicands, int output_divisor, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float> *> input_data, + const dnn::BatchDescriptor &output_dimensions, + DeviceMemory<float> *output_data); + Stream &ThenXYPad(const dnn::BatchDescriptor &dimensions, const DeviceMemory<float> &input_data, int64 left_pad, int64 right_pad, int64 top_pad, int64 bottom_pad, @@ -516,6 +562,14 @@ class Stream { int64 right_trim, int64 top_trim, int64 bottom_trim, DeviceMemory<float> *output_data); + // Grows the input tensor by replicating the X and Y dimensions. The batch and + // depth/feature_map dimensions are unchanged. Currently, the input tensor is + // limited to X=1 and Y=1. + Stream &ThenXYBroadcast(const dnn::BatchDescriptor &dimensions, + const DeviceMemory<float> &input_data, + int64 replicate_x, int64 replicate_y, + DeviceMemory<float> *output_data); + // See DnnSupport::DoMemcpyD2HQuantized. Stream &ThenMemcpyD2HQuantized(const DeviceMemory<float> &gpu_unquantized_src, dnn::QuantizedActivationMode mode, @@ -549,6 +603,14 @@ class Stream { Quantization<ElementType>::kModeId, gpu_unquantized_dst); } + // See DnnSupport::DoCopyHostBuffer2Device. + Stream &ThenCopyHostBuffer2Device(HostBuffer *buffer_src, + DeviceMemory<float> *gpu_unquantized_dst); + + // See DnnSupport::DoCopyDevice2HostBuffer. 
+ Stream &ThenCopyDevice2HostBuffer( + const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst); + ///////////////// // BLAS support @@ -1527,6 +1589,12 @@ class Stream { void SetError() { CheckError(false /* = operation_retcode */); } + void SetErrorAndLogNoDnnSupport() { + SetError(); + LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + // The StreamExecutor that supports the operation of this stream. StreamExecutor *parent_; |