diff options
Diffstat (limited to 'tensorflow/stream_executor/dnn.h')
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 895 |
1 files changed, 895 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h new file mode 100644 index 0000000000..e737d1c78f --- /dev/null +++ b/tensorflow/stream_executor/dnn.h @@ -0,0 +1,895 @@ +// Neural Net operation support for StreamExecutor instances. +// +// This is an abstract interface for a platform to optionally support common +// neural net operations; it accommodates implementations such as the cudnn +// library operations. + +#ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_ +#define TENSORFLOW_STREAM_EXECUTOR_DNN_H_ + +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "tensorflow/stream_executor/platform/port.h" + +namespace perftools { +namespace gputools { + +class Stream; + +namespace dnn { + +// Describes how an input or output layer's data is formatted. +// Specify int64 so there's no padding in BatchDescriptor. +enum class DataLayout : int64 { + kYXDepthBatch = 0, // Same as dist_belief::DF_DEPTH_MAJOR. + kYXBatchDepth, // Same as dist_belief::DF_BATCH_MAJOR. + kBatchYXDepth, // Same as run_brain output, and tensorflow's layout. + kBatchDepthYX, // cuDNN's NCHW layout, data laid out as image, feature, + // maps, rows, columns. +}; + +// Returns a string representation of the given data layout. +string DataLayoutString(DataLayout layout); + +// Specifies a quantization for activations in a given BatchDescriptor. +enum class QuantizedActivationMode { + k8Bit = 1, + k16Bit = 2, + k32Bit = 4, +}; + +// Describes the dimensions that a layer consumes/produces. +// +// This is a matrix (height, width), its "depth" (feature_map_count), +// how many of these matrices are present (count), +// and the maximum and minimum values expected in the matrix (value_max, +// value_min). +// If input is quantized, all values greater +// than value_max will be clipped to value_max and all values less than +// value_min will be clipped to value_min. +// When quantized output is dequantized no value will be greater than +// value_max or less than value_min. +// +// Uses the named argument construction form: +// +// auto input_batch_dimensions = +// BatchDescriptor().set_count(42).set_feature_map_count(7)... +// +// Details: +// +// For a convolutional layer, a single inference takes a 3-dimensional matrix +// of input and produces a 3-dimensional matrix of output. We call the three +// dimensions height, width and feature_map_count, where for an image, the +// height and width correspond to the Y and X pixel indices, respectively, and +// the feature_map_count corresponds to the RGB dimension of the input data. +// Then the count indicates how many 3D matrices are being presented to be +// processed at once; this corresponds to the neural network concept of +// minibatch size. +// +// For a fully connected layer, it's better to put the nodes of the layer in +// the feature_map_count, and leave the height and weight as degenerate (== 1). +// Count indicates how many input vectors (degenerate 3D matrices) are to be +// processed. +// +// If unspecified, value_max and value_min default to 0.0. +// If value_max == value_min the Stream will attempt to derive valid values - +// for example the output of Relu6 activation will always be in the range +// [0.0, 6.0]. +// +// If unspecified, layout defaults to kYXDepthBatch. +class BatchDescriptor { + public: + // Creates a "blank" batch descriptor, which should be initialized via the + // named argument helpers. + BatchDescriptor(); + + // Clones values from 'other' for initialization. + void CloneFrom(const BatchDescriptor& other); + + string ToString() const; + string ToShortString() const; + + // Accessors. + int64 count() const { return count_; } + int64 feature_map_count() const { return feature_map_count_; } + int64 height() const { return height_; } + int64 width() const { return width_; } + float value_max() const { return value_max_; } + float value_min() const { return value_min_; } + DataLayout layout() const { return layout_; } + QuantizedActivationMode quantized_activation_mode() const { + return quantized_activation_mode_; + } + + // Named-argument helpers for avoiding user error during construction. + BatchDescriptor& set_count(int64 value) { + count_ = value; + return *this; + } + BatchDescriptor& set_feature_map_count(int64 value) { + feature_map_count_ = value; + return *this; + } + BatchDescriptor& set_height(int64 value) { + height_ = value; + return *this; + } + BatchDescriptor& set_width(int64 value) { + width_ = value; + return *this; + } + BatchDescriptor& set_value_max(float value) { + value_max_ = value; + return *this; + } + BatchDescriptor& set_value_min(float value) { + value_min_ = value; + return *this; + } + BatchDescriptor& set_layout(DataLayout layout) { + layout_ = layout; + return *this; + } + BatchDescriptor& set_quantized_activation_mode( + QuantizedActivationMode quantized_activation_mode) { + quantized_activation_mode_ = quantized_activation_mode; + return *this; + } + + // Return the number of nodes in a single feature map. + int64 NodesPerFeatureMap() const; + + // Return the number of nodes across all feature maps. Note that this is not + // affected by the batch count. + int64 NodesAcrossFeatureMaps() const; + + // Returns the number of elements (e.g. RGB pixel values) required to hold a + // given batch descriptor, given a no-padding assumption. Note that this is + // affected by the batch count. + int64 ElementCount() const; + + // Return the number of weights required to fully connect a layer with + // dimensions given by the 'input' descriptor with a layer with dimensions + // given by the 'output' descriptor. + static int64 FullyConnectedWeightCount(const BatchDescriptor& input, + const BatchDescriptor& output); + + // Return the number of biases required to fully connect to an output layer + // with dimensions given the 'output' descriptor. + static int64 FullyConnectedBiasCount(const BatchDescriptor& output); + + private: + int64 count_; + int64 feature_map_count_; + int64 height_; + int64 width_; + float value_max_; + float value_min_; + DataLayout layout_; + QuantizedActivationMode quantized_activation_mode_; +}; + +// Describes how a filter is laid out in the memory. +// Specify int64 so there's no padding in FilterDescriptor. +enum class FilterLayout : int64 { + kOutputInputYX = 0, // cuDNN's default filter layout, laid out as: + // (major) output feature maps >> input feature maps >> + // rows >> columns (minor). + kInputYXOutput, // Same as dist_belief's default filter layout. + kYXInputOutput, // Same as tensorflow's default filter layout. +}; + +// Returns a string representation of the given filter layout. +string FilterLayoutString(FilterLayout layout); + +// Describes a filter for the convolution. This is the "window" from +// height-by-width patches of each of the feature maps in the input layer to the +// cells within the output feature map. +// +// Uses the named argument construction form: +// +// FilterDescriptor filter_dimensions; +// filter_dimensions +// .set_output_feature_map_count(42) +// .set_input_feature_map_count(7) +// ... +// +// Arguments: +// - output_feature_map_count: number of feature maps in the output layer. +// - input_feature_map_count: number of feature maps in the input layer (from +// which the filter patch is taken). +// - input_filter_height: "height" number of neurons used in the sliding window +// over the input layer. +// - input_filter_width: "width" number of neurons used in the sliding window +// over the input layer. +// +// Sometimes names like "filter input height" are referred to by synonymous +// terminology, such as "kernel y size". +// +// If unspecified, layout defaults to kOutputInputYX. +class FilterDescriptor { + public: + // By default construction, all dimensions are set to zero, so they should all + // be populated by the user via the named-argument helpers below. (See class + // comment for details.) + FilterDescriptor(); + + ~FilterDescriptor(); + + // Named-argument helpers for avoiding user error during construction. + FilterDescriptor& set_output_feature_map_count(int64 value) { + output_feature_map_count_ = value; + return *this; + } + FilterDescriptor& set_input_feature_map_count(int64 value) { + input_feature_map_count_ = value; + return *this; + } + FilterDescriptor& set_input_filter_height(int64 value) { + input_filter_height_ = value; + return *this; + } + FilterDescriptor& set_input_filter_width(int64 value) { + input_filter_width_ = value; + return *this; + } + FilterDescriptor& set_layout(FilterLayout layout) { + layout_ = layout; + return *this; + } + + void CloneFrom(const FilterDescriptor& other); + + string ToString() const; + string ToShortString() const; + + // Returns the number of weights required as parameters for a convolution + // using this filter descriptor. + int64 ComputeWeightCount() const; + + // Returns the number of biases required as parameters for a convolution using + // this filter descriptor. + int64 bias_count() const { return output_feature_map_count_; } + + int64 output_feature_map_count() const { return output_feature_map_count_; } + int64 input_feature_map_count() const { return input_feature_map_count_; } + int64 input_filter_height() const { return input_filter_height_; } + int64 input_filter_width() const { return input_filter_width_; } + FilterLayout layout() const { return layout_; } + + private: + int64 output_feature_map_count_; + int64 input_feature_map_count_; + int64 input_filter_height_; + int64 input_filter_width_; + FilterLayout layout_; + + SE_DISALLOW_COPY_AND_ASSIGN(FilterDescriptor); +}; + +// Describes a convolution. +// +// Uses the named argument construction form: +// +// ConvolutionDescriptor convolution_dimensions; +// convolution_dimensions +// .set_vertical_filter_stride(2) +// .set_horizontal_filter_stride(2) +// ... +// +// Arguments: +// - zero_padding_height: padding of the "y dimension" of the input data. Note +// that this is different from the height of the filter. +// - zero_padding_width: analogouus to the height above, but in the "x +// dimension". +// - vertical_filter_stride: the convolution slides a 2-dimensional window of +// filter-height-by-filter-width over the input layer -- the center of that +// window is moved in the "y dimension" according to this stride value. +// - horizontal_filter_stride: analogous to the vertical stride above, but in +// the "x dimension". +class ConvolutionDescriptor { + public: + // By default construction, there is no zero-padding and the filter stride is + // 1x1 (centering the filter on every cell in the input layer's + // width-by-height area). + ConvolutionDescriptor(); + ~ConvolutionDescriptor(); + + string ToString() const; + string ToShortString() const; + + ConvolutionDescriptor& set_zero_padding_height(int64 value) { + zero_padding_height_ = value; + return *this; + } + ConvolutionDescriptor& set_zero_padding_width(int64 value) { + zero_padding_width_ = value; + return *this; + } + ConvolutionDescriptor& set_vertical_filter_stride(int64 value) { + vertical_filter_stride_ = value; + return *this; + } + ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) { + horizontal_filter_stride_ = value; + return *this; + } + + int64 zero_padding_height() const { return zero_padding_height_; } + int64 zero_padding_width() const { return zero_padding_width_; } + int64 vertical_filter_stride() const { return vertical_filter_stride_; } + int64 horizontal_filter_stride() const { return horizontal_filter_stride_; } + + private: + int64 zero_padding_height_; + int64 zero_padding_width_; + int64 vertical_filter_stride_; + int64 horizontal_filter_stride_; + // TODO(leary) cudnn provides these fields, but need to characterize what + // their effect is -- they may be boolean rather than integral. + // int64 upscale_input_x; + // int64 upscale_input_y; +}; + +// A patch of values in the input can be pooled via either a max or an average +// operation. +// Specify int64 so there's no padding in PoolingDescriptor. +enum class PoolingMode : int64 { + kMaximum, + kAverage, +}; + +// Describes a pooling operation to be enqueued onto a stream via a platform's +// DnnSupport. +// +// TODO(broune): describe how padding works and what happens if the +// window height/width is not divisible by the vertical/horizontal +// stride. +// +// Arguments: +// pooling_mode: pooling operator to use on the input patch +// window_height: height of input window +// window_width: width of input window +// vertical_stride: vertical delta for center of the input patch +// horizontal_stride: horizontal delta for center of the input patch +class PoolingDescriptor { + public: + PoolingDescriptor(); + + PoolingDescriptor& set_pooling_mode(PoolingMode value) { + mode_ = value; + return *this; + } + PoolingDescriptor& set_window_height(int64 value) { + window_height_ = value; + return *this; + } + PoolingDescriptor& set_window_width(int64 value) { + window_width_ = value; + return *this; + } + PoolingDescriptor& set_vertical_padding(int64 value) { + vertical_padding_ = value; + return *this; + } + PoolingDescriptor& set_horizontal_padding(int64 value) { + horizontal_padding_ = value; + return *this; + } + PoolingDescriptor& set_vertical_stride(int64 value) { + vertical_stride_ = value; + return *this; + } + PoolingDescriptor& set_horizontal_stride(int64 value) { + horizontal_stride_ = value; + return *this; + } + + void CloneFrom(const PoolingDescriptor& other); + + string ToString() const; + string ToShortString() const; + + PoolingMode mode() const { return mode_; } + int64 window_height() const { return window_height_; } + int64 window_width() const { return window_width_; } + int64 vertical_padding() const { return vertical_padding_; } + int64 horizontal_padding() const { return horizontal_padding_; } + int64 vertical_stride() const { return vertical_stride_; } + int64 horizontal_stride() const { return horizontal_stride_; } + + private: + PoolingMode mode_; + int64 window_height_; + int64 window_width_; + int64 vertical_padding_; + int64 horizontal_padding_; + int64 vertical_stride_; + int64 horizontal_stride_; + + SE_DISALLOW_COPY_AND_ASSIGN(PoolingDescriptor); +}; + +// Describes a dist_belief local response normalization. +// The normalization equation is: +// y_i = x_i / (bias + alpha * (sum_j_{i - range}^{i + range} x_j^2)) ^ beta +// where x_i is the input in feature map i, y_i is the output. +// Each feature map is split into segment_size segments for performing the +// sum_j_. If wrap_around is true, the sum_j_ for y_i on the left and right of +// a segment wrap around at the edges of the segment, if wrap_around is false +// zeros are inserted instead. +class NormalizeDescriptor { + public: + NormalizeDescriptor(); + + NormalizeDescriptor& set_bias(float bias) { + bias_ = bias; + return *this; + } + + NormalizeDescriptor& set_range(int32 range) { + range_ = range; + return *this; + } + + NormalizeDescriptor& set_alpha(float alpha) { + alpha_ = alpha; + return *this; + } + + NormalizeDescriptor& set_beta(float beta) { + beta_ = beta; + return *this; + } + + NormalizeDescriptor& set_wrap_around(bool wrap_around) { + wrap_around_ = wrap_around; + return *this; + } + + NormalizeDescriptor& set_segment_size(int32 segment_size) { + segment_size_ = segment_size; + return *this; + } + + void CloneFrom(const NormalizeDescriptor& other); + + string ToString() const; + string ToShortString() const; + + float bias() const { return bias_; } + int32 range() const { return range_; } + float alpha() const { return alpha_; } + float beta() const { return beta_; } + bool wrap_around() const { return wrap_around_; } + int32 segment_size() const { return segment_size_; } + + private: + float bias_; + int32 range_; + float alpha_; + float beta_; + bool wrap_around_; + int32 segment_size_; + + SE_DISALLOW_COPY_AND_ASSIGN(NormalizeDescriptor); +}; + +// Describes a kind of non-linearity (threshold-like mathematical function). +enum class ActivationMode { + kSigmoid, + // Rectified linear activation: f(x) = x < 0 ? 0 : x + kRelu, + // Rectified linear activation, where upper maximum is 6.0. + kRelu6, + // Rectified linear activation, where upper maximum specified by + // BatchDescriptor::value_max(). + kReluX, + kTanh, +}; + +// Returns a string representation of the given activation mode. +string ActivationModeString(ActivationMode mode); + +// Describes the operation that DoElementwiseOperation should perform on its +// inputs. +enum class ElementwiseOperation { + kAdd, + kMultiply +}; + +string ElementwiseOperationString(ElementwiseOperation op); + +// Suite of operations typically used for implementing Deep/Convolutional Neural +// Nets. +class DnnSupport { + public: + DnnSupport() {} + virtual ~DnnSupport() {} + + virtual port::Status Init() = 0; + + // Enqueues a single-precision convolution operation onto the stream. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'convolve' operation + // should be enqueued onto. + // input_descriptor: dimensions of the input layer. + // input_data: un-owned device memory region which contains the + // convolution input. + // filter_descriptor: dimensions of the convolution filter. + // weights: coefficients for the convolution filter, these are multiplied + // against values in the input that the filter convolves over. + // convolution_descriptor: stride of the convolution filter. + // output_descriptor: dimensions of the output layer. + // output_data: un-owned device memory region in which to place the + // convolution result. + // + // input_descriptor, filter_descriptor, convolution_descriptor and + // output_descriptor together specify exactly how the convolution is aligned + // with the input data: + // + // * (input dimensions - filter size + 1) / filter stride == output dimensions + // corresponds to dist_belief padding = VALID, i.e. the input is not padded. + // * input dimensions / filter stride == output dimensions + // corresponds to dist_belief padding = SAME, i.e. input and output are the + // same size - this requires padding the input. + // * (input dimensions + filter size - 1) / filter stride == output dimensions + // corresponds to dist_belief padding = FULL, i.e. the output is sized so + // that if the inverse of the filter is applied to the output in VALID mode + // the result is the same size as the input - this requires even more + // padding + // of the input. + virtual bool DoConvolve( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) = 0; + + // Enqueues a double-precision convolution operation onto the stream. + // See DoConvolve above for argument details. + virtual bool DoConvolve( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory<double>& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory<double>& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory<double>* output_data) = 0; + + // Variation of the above with the weight matrix split into two matrices. + // first_weights: Coefficients of the first matrix. + // second_weights: Coefficients of the second matrix. + // depth_multiplier: specifies the columns of the first matrix and rows + // of the second one - first_weights columns = depth_multiplier, + // second_weights rows = depth_multiplier * + // filter_descriptor.input_feature_map_count(). + // see go/separable for documentation on separable convolutions. + virtual bool DoSeparableConvolve( + Stream* stream, const BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const FilterDescriptor& filter_descriptor, int depth_multiplier, + const DeviceMemory<float>& first_weights, + const DeviceMemory<float>& second_weights, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory<float>* output_data) = 0; + + // Enqueues a single-precision backward convolution (for data) operation onto + // the stream. + // + // Arguments: + // stream: borrowed pointer to the stream that the 'convolve' operation + // should be enqueued onto. + // filter_descriptor: dimensions of the convolution filter. + // filter_data: coefficients for the convolution filter. + // output_descriptor: dimensions of the output gradients, which is the same + // as + // the dimensions of the ouput. + // backward_output_data: un-owned device memory region which contains the + // backprop of the output. + // convolution_descriptor: stride of the convolution filter. + // input_descriptor: dimensions of the input layer. + // backward_input_data: un-owned device memory region in which to place the + // backprop of the input. + virtual bool DoConvolveBackwardData( + Stream* stream, const FilterDescriptor& filter_descriptor, + const DeviceMemory<float>& filter_data, + const BatchDescriptor& output_descriptor, + DeviceMemory<float> backward_output_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& input_descriptor, + DeviceMemory<float>* backward_input_data) = 0; + + // Enqueues a single-precision backward convolution (for filter) operation + // onto + // the stream. + // + // Arguments: + // stream: borrowed pointer to the stream that the 'convolve' operation + // should be enqueued onto. + // input_descriptor: dimensions of the input layer. + // input_data: un-owned device memory region which contains the + // convolution input. + // output_descriptor: dimensions of the output gradients, which is the same + // as + // the dimensions of the ouput. + // backward_output_data: un-owned device memory region which contains the + // backprop of the output. + // convolution_descriptor: stride of the convolution filter. + // filter_descriptor: dimensions of the convolution filter. + // backward_filter_data: un-owned device memory region in which to place the + // backprop of the filter. + virtual bool DoConvolveBackwardFilter( + Stream* stream, const BatchDescriptor& input_descriptor, + const DeviceMemory<float>& input_data, + const BatchDescriptor& output_descriptor, + DeviceMemory<float> backward_output_data, + const ConvolutionDescriptor& convolution_descriptor, + const FilterDescriptor& filter_descriptor, + DeviceMemory<float>* backward_filter_data) = 0; + + // Fully connects the "nodes" (float values) in input_data with + // shape input_dimensions to output_data with output_dimensions + // using provided weights. This is equivalent to computing a matrix + // product, hence the name MatMul. + // + // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products + // happen in two dimensions. To get down to two dimensions, we consider the + // input y, x and depth dimension as one combined dimension T. For now, + // assume that the output height and width are 1 and let OD be the output + // depth. + // + // There are three device memory buffers passed in to this + // function. We can now view all three as matrices: + // + // input_data: A batch x T matrix + // weights: A T x OD matrix + // output_data: A batch x OD matrix + // + // This function then computes the matrix product of input_data and + // weights and writes the result into output_data. + // + // Here the weights buffer is in row major order, i.e. the first OD + // entries in weights are the first row, the second OD entries in + // weights are the second row and so on. + // + // The case for output width*height > 1 is more complicated. Let K = + // OY * OX where OY is the output height and OX is the output + // width. Then weights is divided into K sub-arrays W_i, for + // i=0,...,k-1, that each represent a T x OD matrix. This function + // then computes the K matrix multiplications of input_data with + // each W_i. This creates K matrices with dimensions batch x + // OD. These K matrices are concatenated horizontally to form one + // larger matrix with dimensions batch x (K*OD); note that this is + // not the same as concatenating the bytes of the matrices. The + // combined matrix can then be interpreted as a tensor with + // dimensions (batch, OY, OX, OD). If the output tensor format is + // not kBatchYXDepth, this function would then need to arrange for + // the output to be in the requested layout, if that is + // supported. Note that the case K=1 is equivalent to the + // description above. It is recommended to prefer the case K=1. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'fully connect' operation + // should be enqueued onto. + // output_data: un-owned device memory region in which to place the + // fully connected result. + virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<float>& weights, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) = 0; + + // Version of DoMatMul that uses pre-quantized 8 bit weights. + // weight_scales specifies the scaling of each column of weights: + // original float weight[row * num_columns + column] = + // quantized_weight[row * nnum_columns + column] * weight_scales[column]. + virtual bool DoMatMulQuantized(Stream* stream, + const DeviceMemory<float>& input_data, + const DeviceMemory<int8>& quantized_weights, + const DeviceMemory<float>& weight_scales, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) = 0; + + // Version of DoMatMul that uses pre-quantized 16 bit weights. + // weight_scales specifies the scaling of each column of weights: + // original float weight[row * num_columns + column] = + // quantized_weight[row * nnum_columns + column] * weight_scales[column]. + virtual bool DoMatMulQuantized(Stream* stream, + const DeviceMemory<float>& input_data, + const DeviceMemory<int16>& quantized_weights, + const DeviceMemory<float>& weight_scales, + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) = 0; + + // Adds biases to the feature maps in input_data producing + // output_data. input_data can equal output_data, but must not + // partially overlap it. + // + // Let K = count() * height() * width() and N = feature_map_count() + // on dimensions. Then input_value contains K*N values and biases + // contains N values. We can thus logically consider input_value to + // contain K vectors of N elements each. This function adds biases + // to each of those N vectors. + // + // TODO(broune): This works differently when width() * height() > 1 + // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In + // that case there should be width() * height() * + // feature_map_count() biases, but this is not implemented on all + // StreamExecutors. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'bias add' operation + // should be enqueued onto. + // input_data: un-owned device memory region containing the input. + // biases: un-owned device memory region containing biases to add to the + // input. + // dimensions: dimensions of input_data and output_data. + // output_data: un-owned device memory region in which to place the result. + virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data, + const DeviceMemory<float>& biases, + const dnn::BatchDescriptor& dimensions, + DeviceMemory<float>* output_data) = 0; + + // Performs a forward pooling operation on input_data, writing to + // output_data. See PoolingDescriptor for how to configure the + // pooling operation. + // + // Pooling happens as a window that moves across the Y and X + // dimensions of input_data, where each position of the window + // yields one output value. E.g. for max pooling, the computed value + // is the maximum element in the window. The operation is applied + // independently to each batch and at each feature map (depth), so + // that the output depth and feature_map_count are the same as for + // the input. The output width and height can be different. + // + // See PoolingDescriptor for how to configure the pooling operation. + virtual bool DoPoolForward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) = 0; + + // Performs differentiation of the pooling operation. + virtual bool DoPoolBackward(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory<float>& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory<float>& output_data, + const DeviceMemory<float>& input_diff_data, + DeviceMemory<float>* output_diff_data) = 0; + + // Applies local response normalization to all of the values + // held on the device in 'input_data'. + virtual bool DoNormalize(Stream* stream, + const dnn::NormalizeDescriptor& normalize_descriptor, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) = 0; + + // Applies an activation function (see ActivationMode) to all of the values + // held on the device in 'input_data', whose dimensions are described by + // 'dimensions'. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'activate' operation + // should be enqueued onto. + // activation_mode: Type of activation to perform. + // input_data: un-owned device memory region which contains the + // activate input. + // output_data: un-owned device memory region in which to place the + // activate result. + virtual bool DoActivate(Stream* stream, ActivationMode activation_mode, + const BatchDescriptor& dimensions, + const DeviceMemory<float>& input_data, + DeviceMemory<float>* output_data) = 0; + + // Concatenates several layers into one, by concatenating the depth of each + // layer at matching x and y coordinates. + // The inputs must all have the same width and height, the output will have + // the same width and height as the inputs and its depth will be the sum of + // the input depths. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'depth concatenate' + // operation should be enqueued onto. + // input_dimensions: The dimensions of each input. + // input_data: un-owned device memory region which contains the + // input data for each input layer. + // output_data: un-owned device memory region in which to place the + // depth concatenate result. + virtual bool DoDepthConcatenate( + Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + DeviceMemory<float>* output_data) = 0; + + // Computes the specified operation (e.g. addition or multiplication) + // between corresponding elements in the inputs and stores the result in the + // output element. + // The inputs and output must all have the same dimensions, but may have + // different quantization parameters (min_value and max_value). + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'elementwise operation' + // should be enqueued onto. + // operation: The operation to perform. + // input_dimensions: The dimensions of each input. + // input_data: un-owned device memory region which contains the + // input data for each input layer. + // output_dimensions: The dimensions of the output. + // output_data: un-owned device memory region in which to place the + // operation result. + virtual bool DoElementwiseOperate( + Stream* stream, ElementwiseOperation operation, + port::ArraySlice<dnn::BatchDescriptor> input_dimensions, + port::ArraySlice<const DeviceMemory<float>*> input_data, + const dnn::BatchDescriptor& output_dimensions, + DeviceMemory<float>* output_data) = 0; + + // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that + // is, bytes instead of scaled floats) into 'host_dst' if they are available + // for the underlying DNN implementation. If this quantized output is not + // available, false is returned, which will place 'stream' into an error + // state. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'quantized memcpy' + // operation should be enqueued onto. + // gpu_unquantized_src: the device memory that contains the unquantized data + // -- this data should also have a corresponding quantized representation + // on the device for this operation to succeed. + // host_dst: un-owned host memory region that is mutated in place, + // it is clobbered by the values in 'gpu_unquantized_src' when the enqueued + // (asynchronous) memcpy operation is performed. + // TODO(wgulland) Merge all these versions of DoMemcpyD2HQuantized. + virtual bool DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + port::MutableArraySlice<uint8> host_dst) = 0; + + // As above, but for 16-bit values. + virtual bool DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + port::MutableArraySlice<uint16> host_dst) = 0; + + // As above, but for signed 32-bit values. + virtual bool DoMemcpyD2HQuantized( + Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, + port::MutableArraySlice<int32> host_dst) = 0; + + // Enqueues an asynchronous memcpy of 'host_dst' into the *quantized* input + // of a layer (that is, bytes instead of scaled floats) if they are supported + // by the underlying DNN implementation. If this quantized input is not + // supported, false is returned, which will place 'stream' into an error + // state. + // + // Arguments (all borrowed): + // stream: borrowed pointer to the stream that the 'quantized memcpy' + // operation should be enqueued onto. + // host_src: un-owned host memory region that contains the quantized data. + // gpu_unquantized_dst: the device memory that is clobbered by the values in + // 'host_src' when the enqueued (asynchronous) memcpy operation is + // performed. -- this data should also have a corresponding quantized + // representation on the device for this operation to + // succeed. + virtual bool DoMemcpyH2DQuantized( + Stream* stream, port::ArraySlice<uint8> host_src, + DeviceMemory<float>* gpu_unquantized_dst) = 0; + + private: + SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport); +}; + +} // namespace dnn +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_DNN_H_ |