Diffstat (limited to 'tensorflow/stream_executor/dnn.h')
-rw-r--r-- tensorflow/stream_executor/dnn.h | 895
1 file changed, 895 insertions(+), 0 deletions(-)
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
new file mode 100644
index 0000000000..e737d1c78f
--- /dev/null
+++ b/tensorflow/stream_executor/dnn.h
@@ -0,0 +1,895 @@
+// Neural Net operation support for StreamExecutor instances.
+//
+// This is an abstract interface for a platform to optionally support common
+// neural net operations; it accommodates implementations such as the cudnn
+// library operations.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_
+#define TENSORFLOW_STREAM_EXECUTOR_DNN_H_
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace perftools {
+namespace gputools {
+
+class Stream;
+
+namespace dnn {
+
+// Describes how an input or output layer's data is formatted.
+// Specify int64 so there's no padding in BatchDescriptor.
+enum class DataLayout : int64 {
+ kYXDepthBatch = 0, // Same as dist_belief::DF_DEPTH_MAJOR.
+ kYXBatchDepth, // Same as dist_belief::DF_BATCH_MAJOR.
+ kBatchYXDepth, // Same as run_brain output, and tensorflow's layout.
+  kBatchDepthYX,  // cuDNN's NCHW layout, data laid out as image, feature
+                  // maps, rows, columns.
+};
+
+// Returns a string representation of the given data layout.
+string DataLayoutString(DataLayout layout);
+
+// Specifies a quantization for activations in a given BatchDescriptor.
+enum class QuantizedActivationMode {
+ k8Bit = 1,
+ k16Bit = 2,
+ k32Bit = 4,
+};
+
+// Describes the dimensions that a layer consumes/produces.
+//
+// This is a matrix (height, width), its "depth" (feature_map_count),
+// how many of these matrices are present (count),
+// and the maximum and minimum values expected in the matrix (value_max,
+// value_min).
+// If input is quantized, all values greater than value_max will be clipped
+// to value_max, and all values less than value_min will be clipped to
+// value_min. When quantized output is dequantized, no value will be greater
+// than value_max or less than value_min.
+//
+// Uses the named argument construction form:
+//
+// auto input_batch_dimensions =
+// BatchDescriptor().set_count(42).set_feature_map_count(7)...
+//
+// Details:
+//
+// For a convolutional layer, a single inference takes a 3-dimensional matrix
+// of input and produces a 3-dimensional matrix of output. We call the three
+// dimensions height, width and feature_map_count, where for an image, the
+// height and width correspond to the Y and X pixel indices, respectively, and
+// the feature_map_count corresponds to the RGB dimension of the input data.
+// Then the count indicates how many 3D matrices are being presented to be
+// processed at once; this corresponds to the neural network concept of
+// minibatch size.
+//
+// For a fully connected layer, it's better to put the nodes of the layer in
+// the feature_map_count, and leave the height and width as degenerate (== 1).
+// Count indicates how many input vectors (degenerate 3D matrices) are to be
+// processed.
+//
+// If unspecified, value_max and value_min default to 0.0.
+// If value_max == value_min, the Stream will attempt to derive valid values;
+// for example the output of Relu6 activation will always be in the range
+// [0.0, 6.0].
+//
+// If unspecified, layout defaults to kYXDepthBatch.
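+//
+// A fuller example (hypothetical dimension values, shown for illustration
+// only):
+//
+//    BatchDescriptor input_dimensions;
+//    input_dimensions.set_count(128)       // minibatch size
+//        .set_feature_map_count(3)         // e.g. RGB channels
+//        .set_height(224)
+//        .set_width(224)
+//        .set_layout(DataLayout::kBatchYXDepth);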
+class BatchDescriptor {
+ public:
+ // Creates a "blank" batch descriptor, which should be initialized via the
+ // named argument helpers.
+ BatchDescriptor();
+
+ // Clones values from 'other' for initialization.
+ void CloneFrom(const BatchDescriptor& other);
+
+ string ToString() const;
+ string ToShortString() const;
+
+ // Accessors.
+ int64 count() const { return count_; }
+ int64 feature_map_count() const { return feature_map_count_; }
+ int64 height() const { return height_; }
+ int64 width() const { return width_; }
+ float value_max() const { return value_max_; }
+ float value_min() const { return value_min_; }
+ DataLayout layout() const { return layout_; }
+ QuantizedActivationMode quantized_activation_mode() const {
+ return quantized_activation_mode_;
+ }
+
+ // Named-argument helpers for avoiding user error during construction.
+ BatchDescriptor& set_count(int64 value) {
+ count_ = value;
+ return *this;
+ }
+ BatchDescriptor& set_feature_map_count(int64 value) {
+ feature_map_count_ = value;
+ return *this;
+ }
+ BatchDescriptor& set_height(int64 value) {
+ height_ = value;
+ return *this;
+ }
+ BatchDescriptor& set_width(int64 value) {
+ width_ = value;
+ return *this;
+ }
+ BatchDescriptor& set_value_max(float value) {
+ value_max_ = value;
+ return *this;
+ }
+ BatchDescriptor& set_value_min(float value) {
+ value_min_ = value;
+ return *this;
+ }
+ BatchDescriptor& set_layout(DataLayout layout) {
+ layout_ = layout;
+ return *this;
+ }
+ BatchDescriptor& set_quantized_activation_mode(
+ QuantizedActivationMode quantized_activation_mode) {
+ quantized_activation_mode_ = quantized_activation_mode;
+ return *this;
+ }
+
+  // Returns the number of nodes in a single feature map.
+ int64 NodesPerFeatureMap() const;
+
+  // Returns the number of nodes across all feature maps. Note that this is
+  // not affected by the batch count.
+ int64 NodesAcrossFeatureMaps() const;
+
+ // Returns the number of elements (e.g. RGB pixel values) required to hold a
+ // given batch descriptor, given a no-padding assumption. Note that this is
+ // affected by the batch count.
+ int64 ElementCount() const;
+
+  // Returns the number of weights required to fully connect a layer with
+ // dimensions given by the 'input' descriptor with a layer with dimensions
+ // given by the 'output' descriptor.
+ static int64 FullyConnectedWeightCount(const BatchDescriptor& input,
+ const BatchDescriptor& output);
+
+  // Returns the number of biases required to fully connect to an output
+  // layer with dimensions given by the 'output' descriptor.
+ static int64 FullyConnectedBiasCount(const BatchDescriptor& output);
+
+ private:
+ int64 count_;
+ int64 feature_map_count_;
+ int64 height_;
+ int64 width_;
+ float value_max_;
+ float value_min_;
+ DataLayout layout_;
+ QuantizedActivationMode quantized_activation_mode_;
+};
+
+// Describes how a filter is laid out in the memory.
+// Specify int64 so there's no padding in FilterDescriptor.
+enum class FilterLayout : int64 {
+ kOutputInputYX = 0, // cuDNN's default filter layout, laid out as:
+ // (major) output feature maps >> input feature maps >>
+ // rows >> columns (minor).
+ kInputYXOutput, // Same as dist_belief's default filter layout.
+ kYXInputOutput, // Same as tensorflow's default filter layout.
+};
+
+// Returns a string representation of the given filter layout.
+string FilterLayoutString(FilterLayout layout);
+
+// Describes a filter for the convolution. This is the "window" from
+// height-by-width patches of each of the feature maps in the input layer to the
+// cells within the output feature map.
+//
+// Uses the named argument construction form:
+//
+// FilterDescriptor filter_dimensions;
+// filter_dimensions
+// .set_output_feature_map_count(42)
+// .set_input_feature_map_count(7)
+// ...
+//
+// Arguments:
+// - output_feature_map_count: number of feature maps in the output layer.
+// - input_feature_map_count: number of feature maps in the input layer (from
+// which the filter patch is taken).
+// - input_filter_height: "height" number of neurons used in the sliding window
+// over the input layer.
+// - input_filter_width: "width" number of neurons used in the sliding window
+// over the input layer.
+//
+// Sometimes names like "filter input height" are referred to by synonymous
+// terminology, such as "kernel y size".
+//
+// If unspecified, layout defaults to kOutputInputYX.
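+//
+// A fuller example (hypothetical dimension values, shown for illustration
+// only):
+//
+//    FilterDescriptor filter_dimensions;
+//    filter_dimensions
+//        .set_output_feature_map_count(64)
+//        .set_input_feature_map_count(3)
+//        .set_input_filter_height(5)
+//        .set_input_filter_width(5);
+//
+// For such a descriptor one would expect ComputeWeightCount() to be
+// 64 * 3 * 5 * 5, the product of the four dimensions.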
+class FilterDescriptor {
+ public:
+  // Upon default construction, all dimensions are set to zero, so they
+  // should all be populated by the user via the named-argument helpers
+  // below. (See class comment for details.)
+ FilterDescriptor();
+
+ ~FilterDescriptor();
+
+ // Named-argument helpers for avoiding user error during construction.
+ FilterDescriptor& set_output_feature_map_count(int64 value) {
+ output_feature_map_count_ = value;
+ return *this;
+ }
+ FilterDescriptor& set_input_feature_map_count(int64 value) {
+ input_feature_map_count_ = value;
+ return *this;
+ }
+ FilterDescriptor& set_input_filter_height(int64 value) {
+ input_filter_height_ = value;
+ return *this;
+ }
+ FilterDescriptor& set_input_filter_width(int64 value) {
+ input_filter_width_ = value;
+ return *this;
+ }
+ FilterDescriptor& set_layout(FilterLayout layout) {
+ layout_ = layout;
+ return *this;
+ }
+
+ void CloneFrom(const FilterDescriptor& other);
+
+ string ToString() const;
+ string ToShortString() const;
+
+ // Returns the number of weights required as parameters for a convolution
+ // using this filter descriptor.
+ int64 ComputeWeightCount() const;
+
+ // Returns the number of biases required as parameters for a convolution using
+ // this filter descriptor.
+ int64 bias_count() const { return output_feature_map_count_; }
+
+ int64 output_feature_map_count() const { return output_feature_map_count_; }
+ int64 input_feature_map_count() const { return input_feature_map_count_; }
+ int64 input_filter_height() const { return input_filter_height_; }
+ int64 input_filter_width() const { return input_filter_width_; }
+ FilterLayout layout() const { return layout_; }
+
+ private:
+ int64 output_feature_map_count_;
+ int64 input_feature_map_count_;
+ int64 input_filter_height_;
+ int64 input_filter_width_;
+ FilterLayout layout_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(FilterDescriptor);
+};
+
+// Describes a convolution.
+//
+// Uses the named argument construction form:
+//
+// ConvolutionDescriptor convolution_dimensions;
+// convolution_dimensions
+// .set_vertical_filter_stride(2)
+// .set_horizontal_filter_stride(2)
+// ...
+//
+// Arguments:
+// - zero_padding_height: padding of the "y dimension" of the input data. Note
+// that this is different from the height of the filter.
+// - zero_padding_width: analogous to the height above, but in the "x
+// dimension".
+// - vertical_filter_stride: the convolution slides a 2-dimensional window of
+// filter-height-by-filter-width over the input layer -- the center of that
+// window is moved in the "y dimension" according to this stride value.
+// - horizontal_filter_stride: analogous to the vertical stride above, but in
+// the "x dimension".
+class ConvolutionDescriptor {
+ public:
+  // Upon default construction, there is no zero-padding and the filter
+  // stride is 1x1 (centering the filter on every cell in the input layer's
+  // width-by-height area).
+ ConvolutionDescriptor();
+ ~ConvolutionDescriptor();
+
+ string ToString() const;
+ string ToShortString() const;
+
+ ConvolutionDescriptor& set_zero_padding_height(int64 value) {
+ zero_padding_height_ = value;
+ return *this;
+ }
+ ConvolutionDescriptor& set_zero_padding_width(int64 value) {
+ zero_padding_width_ = value;
+ return *this;
+ }
+ ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
+ vertical_filter_stride_ = value;
+ return *this;
+ }
+ ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
+ horizontal_filter_stride_ = value;
+ return *this;
+ }
+
+ int64 zero_padding_height() const { return zero_padding_height_; }
+ int64 zero_padding_width() const { return zero_padding_width_; }
+ int64 vertical_filter_stride() const { return vertical_filter_stride_; }
+ int64 horizontal_filter_stride() const { return horizontal_filter_stride_; }
+
+ private:
+ int64 zero_padding_height_;
+ int64 zero_padding_width_;
+ int64 vertical_filter_stride_;
+ int64 horizontal_filter_stride_;
+ // TODO(leary) cudnn provides these fields, but need to characterize what
+ // their effect is -- they may be boolean rather than integral.
+ // int64 upscale_input_x;
+ // int64 upscale_input_y;
+};
+
+// A patch of values in the input can be pooled via either a max or an average
+// operation.
+// Specify int64 so there's no padding in PoolingDescriptor.
+enum class PoolingMode : int64 {
+ kMaximum,
+ kAverage,
+};
+
+// Describes a pooling operation to be enqueued onto a stream via a platform's
+// DnnSupport.
+//
+// TODO(broune): describe how padding works and what happens if the
+// window height/width is not divisible by the vertical/horizontal
+// stride.
+//
+// Arguments:
+// pooling_mode: pooling operator to use on the input patch
+// window_height: height of input window
+// window_width: width of input window
+// vertical_stride: vertical delta for center of the input patch
+// horizontal_stride: horizontal delta for center of the input patch
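+//
+// For example, 3x3 max pooling with a 2x2 stride (hypothetical values,
+// shown for illustration only):
+//
+//    PoolingDescriptor pooling_dimensions;
+//    pooling_dimensions
+//        .set_pooling_mode(PoolingMode::kMaximum)
+//        .set_window_height(3)
+//        .set_window_width(3)
+//        .set_vertical_stride(2)
+//        .set_horizontal_stride(2);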
+class PoolingDescriptor {
+ public:
+ PoolingDescriptor();
+
+ PoolingDescriptor& set_pooling_mode(PoolingMode value) {
+ mode_ = value;
+ return *this;
+ }
+ PoolingDescriptor& set_window_height(int64 value) {
+ window_height_ = value;
+ return *this;
+ }
+ PoolingDescriptor& set_window_width(int64 value) {
+ window_width_ = value;
+ return *this;
+ }
+ PoolingDescriptor& set_vertical_padding(int64 value) {
+ vertical_padding_ = value;
+ return *this;
+ }
+ PoolingDescriptor& set_horizontal_padding(int64 value) {
+ horizontal_padding_ = value;
+ return *this;
+ }
+ PoolingDescriptor& set_vertical_stride(int64 value) {
+ vertical_stride_ = value;
+ return *this;
+ }
+ PoolingDescriptor& set_horizontal_stride(int64 value) {
+ horizontal_stride_ = value;
+ return *this;
+ }
+
+ void CloneFrom(const PoolingDescriptor& other);
+
+ string ToString() const;
+ string ToShortString() const;
+
+ PoolingMode mode() const { return mode_; }
+ int64 window_height() const { return window_height_; }
+ int64 window_width() const { return window_width_; }
+ int64 vertical_padding() const { return vertical_padding_; }
+ int64 horizontal_padding() const { return horizontal_padding_; }
+ int64 vertical_stride() const { return vertical_stride_; }
+ int64 horizontal_stride() const { return horizontal_stride_; }
+
+ private:
+ PoolingMode mode_;
+ int64 window_height_;
+ int64 window_width_;
+ int64 vertical_padding_;
+ int64 horizontal_padding_;
+ int64 vertical_stride_;
+ int64 horizontal_stride_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(PoolingDescriptor);
+};
+
+// Describes a dist_belief local response normalization.
+// The normalization equation is:
+//   y_i = x_i / (bias + alpha * (sum_{j=i-range}^{i+range} x_j^2))^beta
+// where x_i is the input in feature map i and y_i is the output.
+// Each feature map is split into segment_size segments for performing the
+// sum over j. If wrap_around is true, the sum for y_i wraps around at the
+// edges of the segment; if wrap_around is false, zeros are inserted instead.
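+//
+// For example (hypothetical parameter values, shown for illustration only):
+//
+//    NormalizeDescriptor normalize_descriptor;
+//    normalize_descriptor
+//        .set_bias(1.0f)
+//        .set_range(5)
+//        .set_alpha(1e-4f)
+//        .set_beta(0.75f);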
+class NormalizeDescriptor {
+ public:
+ NormalizeDescriptor();
+
+ NormalizeDescriptor& set_bias(float bias) {
+ bias_ = bias;
+ return *this;
+ }
+
+ NormalizeDescriptor& set_range(int32 range) {
+ range_ = range;
+ return *this;
+ }
+
+ NormalizeDescriptor& set_alpha(float alpha) {
+ alpha_ = alpha;
+ return *this;
+ }
+
+ NormalizeDescriptor& set_beta(float beta) {
+ beta_ = beta;
+ return *this;
+ }
+
+ NormalizeDescriptor& set_wrap_around(bool wrap_around) {
+ wrap_around_ = wrap_around;
+ return *this;
+ }
+
+ NormalizeDescriptor& set_segment_size(int32 segment_size) {
+ segment_size_ = segment_size;
+ return *this;
+ }
+
+ void CloneFrom(const NormalizeDescriptor& other);
+
+ string ToString() const;
+ string ToShortString() const;
+
+ float bias() const { return bias_; }
+ int32 range() const { return range_; }
+ float alpha() const { return alpha_; }
+ float beta() const { return beta_; }
+ bool wrap_around() const { return wrap_around_; }
+ int32 segment_size() const { return segment_size_; }
+
+ private:
+ float bias_;
+ int32 range_;
+ float alpha_;
+ float beta_;
+ bool wrap_around_;
+ int32 segment_size_;
+
+ SE_DISALLOW_COPY_AND_ASSIGN(NormalizeDescriptor);
+};
+
+// Describes a kind of non-linearity (threshold-like mathematical function).
+enum class ActivationMode {
+ kSigmoid,
+ // Rectified linear activation: f(x) = x < 0 ? 0 : x
+ kRelu,
+  // Rectified linear activation, where the upper maximum is 6.0.
+  kRelu6,
+  // Rectified linear activation, where the upper maximum is specified by
+  // BatchDescriptor::value_max().
+ kReluX,
+ kTanh,
+};
+
+// Returns a string representation of the given activation mode.
+string ActivationModeString(ActivationMode mode);
+
+// Describes the operation that DoElementwiseOperation should perform on its
+// inputs.
+enum class ElementwiseOperation {
+ kAdd,
+ kMultiply
+};
+
+string ElementwiseOperationString(ElementwiseOperation op);
+
+// Suite of operations typically used for implementing Deep/Convolutional Neural
+// Nets.
+class DnnSupport {
+ public:
+ DnnSupport() {}
+ virtual ~DnnSupport() {}
+
+ virtual port::Status Init() = 0;
+
+ // Enqueues a single-precision convolution operation onto the stream.
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'convolve' operation
+ // should be enqueued onto.
+ // input_descriptor: dimensions of the input layer.
+ // input_data: un-owned device memory region which contains the
+ // convolution input.
+ // filter_descriptor: dimensions of the convolution filter.
+  //  filter_data: coefficients for the convolution filter; these are
+  //    multiplied against values in the input that the filter convolves over.
+ // convolution_descriptor: stride of the convolution filter.
+ // output_descriptor: dimensions of the output layer.
+ // output_data: un-owned device memory region in which to place the
+ // convolution result.
+ //
+ // input_descriptor, filter_descriptor, convolution_descriptor and
+ // output_descriptor together specify exactly how the convolution is aligned
+ // with the input data:
+ //
+ // * (input dimensions - filter size + 1) / filter stride == output dimensions
+ // corresponds to dist_belief padding = VALID, i.e. the input is not padded.
+ // * input dimensions / filter stride == output dimensions
+ // corresponds to dist_belief padding = SAME, i.e. input and output are the
+ // same size - this requires padding the input.
+  // * (input dimensions + filter size - 1) / filter stride == output dimensions
+  //   corresponds to dist_belief padding = FULL, i.e. the output is sized so
+  //   that if the inverse of the filter is applied to the output in VALID mode
+  //   the result is the same size as the input - this requires even more
+  //   padding of the input.
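+  //
+  // For example (illustrative numbers only): a 28x28 input convolved with a
+  // 5x5 filter at 1x1 stride yields a 24x24 output in VALID mode
+  // ((28 - 5 + 1) / 1 == 24), or a 28x28 output in SAME mode, which requires
+  // (5 - 1) / 2 == 2 pixels of zero padding on each edge.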
+ virtual bool DoConvolve(
+ Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+ const DeviceMemory<float>& input_data,
+ const dnn::FilterDescriptor& filter_descriptor,
+ const DeviceMemory<float>& filter_data,
+ const dnn::ConvolutionDescriptor& convolution_descriptor,
+ const dnn::BatchDescriptor& output_descriptor,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Enqueues a double-precision convolution operation onto the stream.
+ // See DoConvolve above for argument details.
+ virtual bool DoConvolve(
+ Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+ const DeviceMemory<double>& input_data,
+ const dnn::FilterDescriptor& filter_descriptor,
+ const DeviceMemory<double>& filter_data,
+ const dnn::ConvolutionDescriptor& convolution_descriptor,
+ const dnn::BatchDescriptor& output_descriptor,
+ DeviceMemory<double>* output_data) = 0;
+
+ // Variation of the above with the weight matrix split into two matrices.
+ // first_weights: Coefficients of the first matrix.
+ // second_weights: Coefficients of the second matrix.
+ // depth_multiplier: specifies the columns of the first matrix and rows
+ // of the second one - first_weights columns = depth_multiplier,
+ // second_weights rows = depth_multiplier *
+ // filter_descriptor.input_feature_map_count().
+  // See go/separable for documentation on separable convolutions.
+ virtual bool DoSeparableConvolve(
+ Stream* stream, const BatchDescriptor& input_descriptor,
+ const DeviceMemory<float>& input_data,
+ const FilterDescriptor& filter_descriptor, int depth_multiplier,
+ const DeviceMemory<float>& first_weights,
+ const DeviceMemory<float>& second_weights,
+ const ConvolutionDescriptor& convolution_descriptor,
+ const BatchDescriptor& output_descriptor,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Enqueues a single-precision backward convolution (for data) operation onto
+ // the stream.
+ //
+ // Arguments:
+ // stream: borrowed pointer to the stream that the 'convolve' operation
+ // should be enqueued onto.
+ // filter_descriptor: dimensions of the convolution filter.
+ // filter_data: coefficients for the convolution filter.
+  //  output_descriptor: dimensions of the output gradients, which are the
+  //    same as the dimensions of the output.
+ // backward_output_data: un-owned device memory region which contains the
+ // backprop of the output.
+ // convolution_descriptor: stride of the convolution filter.
+ // input_descriptor: dimensions of the input layer.
+ // backward_input_data: un-owned device memory region in which to place the
+ // backprop of the input.
+ virtual bool DoConvolveBackwardData(
+ Stream* stream, const FilterDescriptor& filter_descriptor,
+ const DeviceMemory<float>& filter_data,
+ const BatchDescriptor& output_descriptor,
+ DeviceMemory<float> backward_output_data,
+ const ConvolutionDescriptor& convolution_descriptor,
+ const BatchDescriptor& input_descriptor,
+ DeviceMemory<float>* backward_input_data) = 0;
+
+  // Enqueues a single-precision backward convolution (for filter) operation
+  // onto the stream.
+ //
+ // Arguments:
+ // stream: borrowed pointer to the stream that the 'convolve' operation
+ // should be enqueued onto.
+ // input_descriptor: dimensions of the input layer.
+ // input_data: un-owned device memory region which contains the
+ // convolution input.
+  //  output_descriptor: dimensions of the output gradients, which are the
+  //    same as the dimensions of the output.
+ // backward_output_data: un-owned device memory region which contains the
+ // backprop of the output.
+ // convolution_descriptor: stride of the convolution filter.
+ // filter_descriptor: dimensions of the convolution filter.
+ // backward_filter_data: un-owned device memory region in which to place the
+ // backprop of the filter.
+ virtual bool DoConvolveBackwardFilter(
+ Stream* stream, const BatchDescriptor& input_descriptor,
+ const DeviceMemory<float>& input_data,
+ const BatchDescriptor& output_descriptor,
+ DeviceMemory<float> backward_output_data,
+ const ConvolutionDescriptor& convolution_descriptor,
+ const FilterDescriptor& filter_descriptor,
+ DeviceMemory<float>* backward_filter_data) = 0;
+
+ // Fully connects the "nodes" (float values) in input_data with
+ // shape input_dimensions to output_data with output_dimensions
+ // using provided weights. This is equivalent to computing a matrix
+ // product, hence the name MatMul.
+ //
+ // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products
+ // happen in two dimensions. To get down to two dimensions, we consider the
+ // input y, x and depth dimension as one combined dimension T. For now,
+ // assume that the output height and width are 1 and let OD be the output
+ // depth.
+ //
+ // There are three device memory buffers passed in to this
+ // function. We can now view all three as matrices:
+ //
+ // input_data: A batch x T matrix
+ // weights: A T x OD matrix
+ // output_data: A batch x OD matrix
+ //
+ // This function then computes the matrix product of input_data and
+ // weights and writes the result into output_data.
+ //
+ // Here the weights buffer is in row major order, i.e. the first OD
+ // entries in weights are the first row, the second OD entries in
+ // weights are the second row and so on.
+ //
+ // The case for output width*height > 1 is more complicated. Let K =
+ // OY * OX where OY is the output height and OX is the output
+ // width. Then weights is divided into K sub-arrays W_i, for
+  // i = 0, ..., K-1, each representing a T x OD matrix. This function
+ // then computes the K matrix multiplications of input_data with
+ // each W_i. This creates K matrices with dimensions batch x
+ // OD. These K matrices are concatenated horizontally to form one
+ // larger matrix with dimensions batch x (K*OD); note that this is
+ // not the same as concatenating the bytes of the matrices. The
+ // combined matrix can then be interpreted as a tensor with
+ // dimensions (batch, OY, OX, OD). If the output tensor format is
+ // not kBatchYXDepth, this function would then need to arrange for
+ // the output to be in the requested layout, if that is
+ // supported. Note that the case K=1 is equivalent to the
+ // description above. It is recommended to prefer the case K=1.
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'fully connect' operation
+ // should be enqueued onto.
+ // output_data: un-owned device memory region in which to place the
+ // fully connected result.
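+  //
+  // For example (illustrative dimensions only): a batch of 32 input vectors
+  // of 1000 elements each (count == 32, feature_map_count == 1000,
+  // height == width == 1) fully connected to 10 output nodes makes
+  // input_data a 32 x 1000 matrix, weights a 1000 x 10 row-major matrix,
+  // and output_data a 32 x 10 matrix.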
+ virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
+ const DeviceMemory<float>& weights,
+ const dnn::BatchDescriptor& input_dimensions,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Version of DoMatMul that uses pre-quantized 8 bit weights.
+ // weight_scales specifies the scaling of each column of weights:
+ // original float weight[row * num_columns + column] =
+  //  quantized_weight[row * num_columns + column] * weight_scales[column].
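+  // For example, if weight_scales[2] == 0.5f and the quantized weight at
+  // column 2 of some row is 6, the effective float weight there is 3.0f
+  // (illustrative values only).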
+ virtual bool DoMatMulQuantized(Stream* stream,
+ const DeviceMemory<float>& input_data,
+ const DeviceMemory<int8>& quantized_weights,
+ const DeviceMemory<float>& weight_scales,
+ const dnn::BatchDescriptor& input_dimensions,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Version of DoMatMul that uses pre-quantized 16 bit weights.
+ // weight_scales specifies the scaling of each column of weights:
+ // original float weight[row * num_columns + column] =
+  //  quantized_weight[row * num_columns + column] * weight_scales[column].
+ virtual bool DoMatMulQuantized(Stream* stream,
+ const DeviceMemory<float>& input_data,
+ const DeviceMemory<int16>& quantized_weights,
+ const DeviceMemory<float>& weight_scales,
+ const dnn::BatchDescriptor& input_dimensions,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Adds biases to the feature maps in input_data producing
+ // output_data. input_data can equal output_data, but must not
+ // partially overlap it.
+ //
+  // Let K = count() * height() * width() and N = feature_map_count() of
+  // 'dimensions'. Then input_data contains K*N values and biases contains
+  // N values. We can thus logically consider input_data to contain K
+  // vectors of N elements each. This function adds the biases to each of
+  // those K vectors.
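+  //
+  // For example (illustrative dimensions only): with count == 2, height ==
+  // width == 1, and feature_map_count == 3, input_data holds K == 2 vectors
+  // of N == 3 values each, and biases holds 3 values that are added to each
+  // of those 2 vectors.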
+ //
+ // TODO(broune): This works differently when width() * height() > 1
+ // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In
+ // that case there should be width() * height() *
+ // feature_map_count() biases, but this is not implemented on all
+ // StreamExecutors.
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'bias add' operation
+ // should be enqueued onto.
+ // input_data: un-owned device memory region containing the input.
+ // biases: un-owned device memory region containing biases to add to the
+ // input.
+ // dimensions: dimensions of input_data and output_data.
+ // output_data: un-owned device memory region in which to place the result.
+ virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data,
+ const DeviceMemory<float>& biases,
+ const dnn::BatchDescriptor& dimensions,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Performs a forward pooling operation on input_data, writing to
+ // output_data. See PoolingDescriptor for how to configure the
+ // pooling operation.
+ //
+ // Pooling happens as a window that moves across the Y and X
+ // dimensions of input_data, where each position of the window
+ // yields one output value. E.g. for max pooling, the computed value
+ // is the maximum element in the window. The operation is applied
+ // independently to each batch and at each feature map (depth), so
+ // that the output depth and feature_map_count are the same as for
+ // the input. The output width and height can be different.
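+  //
+  // For example (illustrative numbers only): max pooling a 24x24 input with
+  // a 2x2 window and a 2x2 stride yields a 12x12 output with the same count
+  // and feature_map_count as the input.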
+ virtual bool DoPoolForward(Stream* stream,
+ const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<float>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Performs differentiation of the pooling operation.
+ virtual bool DoPoolBackward(Stream* stream,
+ const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<float>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ const DeviceMemory<float>& output_data,
+ const DeviceMemory<float>& input_diff_data,
+ DeviceMemory<float>* output_diff_data) = 0;
+
+ // Applies local response normalization to all of the values
+ // held on the device in 'input_data'.
+ virtual bool DoNormalize(Stream* stream,
+ const dnn::NormalizeDescriptor& normalize_descriptor,
+ const DeviceMemory<float>& input_data,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Applies an activation function (see ActivationMode) to all of the values
+ // held on the device in 'input_data', whose dimensions are described by
+ // 'dimensions'.
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'activate' operation
+ // should be enqueued onto.
+ // activation_mode: Type of activation to perform.
+ // input_data: un-owned device memory region which contains the
+ // activate input.
+ // output_data: un-owned device memory region in which to place the
+ // activate result.
+ virtual bool DoActivate(Stream* stream, ActivationMode activation_mode,
+ const BatchDescriptor& dimensions,
+ const DeviceMemory<float>& input_data,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Concatenates several layers into one, by concatenating the depth of each
+ // layer at matching x and y coordinates.
+ // The inputs must all have the same width and height, the output will have
+ // the same width and height as the inputs and its depth will be the sum of
+ // the input depths.
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'depth concatenate'
+ // operation should be enqueued onto.
+ // input_dimensions: The dimensions of each input.
+ // input_data: un-owned device memory region which contains the
+ // input data for each input layer.
+ // output_data: un-owned device memory region in which to place the
+ // depth concatenate result.
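+  //
+  // For example (illustrative dimensions only): concatenating two 7x7 inputs
+  // with depths 3 and 5 yields a 7x7 output with depth 3 + 5 == 8.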
+ virtual bool DoDepthConcatenate(
+ Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+ port::ArraySlice<const DeviceMemory<float>*> input_data,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Computes the specified operation (e.g. addition or multiplication)
+ // between corresponding elements in the inputs and stores the result in the
+ // output element.
+ // The inputs and output must all have the same dimensions, but may have
+ // different quantization parameters (min_value and max_value).
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'elementwise operation'
+ // should be enqueued onto.
+ // operation: The operation to perform.
+ // input_dimensions: The dimensions of each input.
+ // input_data: un-owned device memory region which contains the
+ // input data for each input layer.
+ // output_dimensions: The dimensions of the output.
+ // output_data: un-owned device memory region in which to place the
+ // operation result.
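+  //
+  // For example, ElementwiseOperation::kAdd over two inputs (call them
+  // input0 and input1) writes input0[i] + input1[i] into output[i] at each
+  // element position i.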
+ virtual bool DoElementwiseOperate(
+ Stream* stream, ElementwiseOperation operation,
+ port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+ port::ArraySlice<const DeviceMemory<float>*> input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<float>* output_data) = 0;
+
+ // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that
+ // is, bytes instead of scaled floats) into 'host_dst' if they are available
+ // for the underlying DNN implementation. If this quantized output is not
+ // available, false is returned, which will place 'stream' into an error
+ // state.
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'quantized memcpy'
+ // operation should be enqueued onto.
+ // gpu_unquantized_src: the device memory that contains the unquantized data
+ // -- this data should also have a corresponding quantized representation
+ // on the device for this operation to succeed.
+  //  host_dst: un-owned host memory region that is mutated in place; it is
+  //    clobbered by the values in 'gpu_unquantized_src' when the enqueued
+ // (asynchronous) memcpy operation is performed.
+ // TODO(wgulland) Merge all these versions of DoMemcpyD2HQuantized.
+ virtual bool DoMemcpyD2HQuantized(
+ Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+ port::MutableArraySlice<uint8> host_dst) = 0;
+
+ // As above, but for 16-bit values.
+ virtual bool DoMemcpyD2HQuantized(
+ Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+ port::MutableArraySlice<uint16> host_dst) = 0;
+
+ // As above, but for signed 32-bit values.
+ virtual bool DoMemcpyD2HQuantized(
+ Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+ port::MutableArraySlice<int32> host_dst) = 0;
+
+ // Enqueues an asynchronous memcpy of 'host_dst' into the *quantized* input
+ // of a layer (that is, bytes instead of scaled floats) if they are supported
+ // by the underlying DNN implementation. If this quantized input is not
+ // supported, false is returned, which will place 'stream' into an error
+ // state.
+ //
+ // Arguments (all borrowed):
+ // stream: borrowed pointer to the stream that the 'quantized memcpy'
+ // operation should be enqueued onto.
+ // host_src: un-owned host memory region that contains the quantized data.
+  //  gpu_unquantized_dst: the device memory that is clobbered by the values
+  //    in 'host_src' when the enqueued (asynchronous) memcpy operation is
+  //    performed -- this data should also have a corresponding quantized
+  //    representation on the device for this operation to succeed.
+ virtual bool DoMemcpyH2DQuantized(
+ Stream* stream, port::ArraySlice<uint8> host_src,
+ DeviceMemory<float>* gpu_unquantized_dst) = 0;
+
+ private:
+ SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
+};
+
+} // namespace dnn
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_DNN_H_