// tensorflow/core/kernels/conv_ops.cc

// See docs in ../ops/nn_ops.cc.

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/public/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/public/tensor.h"

#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/stream_executor/stream.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

// Short aliases for the two Eigen device types this file specializes on.
using CPUDevice = Eigen::ThreadPoolDevice;
using GPUDevice = Eigen::GpuDevice;

// Device-agnostic Conv2D launcher built on the functors from conv_2d.h.
//
// launch() computes the convolution of `input` with `filter` on
// ctx->eigen_device<Device>() and writes it into `output`, which the caller
// must already have allocated with its final shape.
//
//   ctx     - kernel context supplying the Eigen device.
//   input   - 4-D input tensor.
//   filter  - 4-D filter tensor; dims 0 and 1 are the spatial extent, dim 2 the
//             input depth and dim 3 the output depth.
//   stride  - stride applied to both spatial dimensions.
//   padding - Eigen padding mode forwarded to SpatialConvolution.
//   output  - pre-allocated 4-D output tensor.
template <typename Device, typename T>
struct LaunchGeneric {
  static void launch(OpKernelContext* ctx, const Tensor& input,
                     const Tensor& filter, int stride,
                     const Eigen::PaddingType& padding, Tensor* output) {
    if (filter.dim_size(1) == filter.dim_size(0) && filter.dim_size(0) == 1 &&
        stride == 1) {
      // For 1x1 kernel, the 2D convolution is reduced to matrix
      // multiplication.
      //
      // TODO(vrv): We should be able to call SpatialConvolution
      // and it will produce the same result, but doing so
      // led to NaNs during training.  Using matmul instead for now.

      // Number of rows of the flattened matrices: the product of the first
      // three output dimensions.  Accumulated in 64 bits because dim_size()
      // values are 64-bit and their product can exceed the range of int.
      int64 conv_width = 1;
      for (int i = 0; i < 3; ++i) {
        conv_width *= output->dim_size(i);
      }

      // Contract dimension 1 of the flattened input with dimension 0 of the
      // flattened filter, i.e. an ordinary matrix product.
      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<Device, T>()(
          ctx->eigen_device<Device>(),
          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
          dim_pair);
    } else {
      functor::SpatialConvolution<Device, T>()(
          ctx->eigen_device<Device>(), output->tensor<T, 4>(),
          input.tensor<T, 4>(), filter.tensor<T, 4>(), stride, padding);
    }
  }
};

// Per-device Conv2D launcher.  The primary template is only declared; each
// supported device provides its own specialization.
template <typename Device, typename T>
struct LaunchConvOp;

// CPU specialization.  The use_cudnn flag is accepted for interface parity
// but not consulted: the work is always handed to the generic launcher.
template <typename T>
struct LaunchConvOp<CPUDevice, T> {
  static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input,
                     const Tensor& filter, int stride,
                     const Eigen::PaddingType& padding, Tensor* output) {
    typedef LaunchGeneric<CPUDevice, T> GenericLauncher;
    GenericLauncher::launch(ctx, input, filter, stride, padding, output);
  }
};

// Conv2D kernel: validates its inputs, allocates the output and hands the
// actual computation to LaunchConvOp<Device, T>.
//
// Tensor layouts:
//   input  : [ batch, in_rows, in_cols, in_depth ]
//   filter : [ filter_rows, filter_cols, in_depth, out_depth ]
//   output : [ batch, out_rows, out_cols, out_depth ]
//
// Attributes read at construction:
//   "strides"          - four entries; the row and column strides (entries 1
//                        and 2) must be equal, and the batch and depth strides
//                        (entries 0 and 3) must be 1.
//   "use_cudnn_on_gpu" - requested cuDNN usage; further gated by CanUseCudnn().
//   "padding"          - padding mode.
template <typename Device, typename T>
class Conv2DOp : public BinaryOp<T> {
 public:
  explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    use_cudnn_ &= CanUseCudnn();
    // The size check must precede the element accesses below.
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument(
                    "Sliding window strides field must "
                    "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
                errors::InvalidArgument(
                    "Current implementation does not yet support "
                    "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  // Validates the two inputs, allocates output 0 and launches the convolution.
  // Any validation or allocation failure is recorded on `context` and aborts
  // the computation.
  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]

    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth]
    const Tensor& filter = context->input(1);

    // For 2D convolution, there should be 4 dimensions.
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional: ",
                                        input.shape().ShortDebugString()));
    OP_REQUIRES(context, filter.dims() == 4,
                errors::InvalidArgument("filter must be 4-dimensional: ",
                                        filter.shape().ShortDebugString()));

    // The last dimension for input is in_depth. It must be the same as the
    // filter's in_depth.
    const int64 in_depth = input.dim_size(3);
    OP_REQUIRES(
        context, in_depth == filter.dim_size(2),
        errors::InvalidArgument("input and filter must have the same depth: ",
                                in_depth, " vs ", filter.dim_size(2)));

    // The last dimension for filter is out_depth.
    const int64 out_depth = filter.dim_size(3);

    // The second dimension for input is rows/height.
    // The first dimension for filter is rows/height.
    const int64 input_rows = input.dim_size(1);
    const int64 filter_rows = filter.dim_size(0);

    // The third dimension for input is columns/width.
    // The second dimension for filter is columns/width.
    const int64 input_cols = input.dim_size(2);
    const int64 filter_cols = filter.dim_size(1);

    // The first dimension for input is batch.
    const int64 batch = input.dim_size(0);

    // For now we take the stride from the second dimension only (we
    // assume row = col stride, and do not support striding on the
    // batch or depth dimension).
    const int stride = strides_[1];

    int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
    if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) {
      // For 1x1 kernel, the 2D convolution is reduced to matrix
      // multiplication.
      out_rows = input_rows;
      out_cols = input_cols;
    } else {
      OP_REQUIRES_OK(
          context, Get2dOutputSize(input_rows, input_cols, filter_rows,
                                   filter_cols, stride, stride, padding_,
                                   &out_rows, &out_cols, &pad_rows, &pad_cols));
    }
    TensorShape out_shape({batch, out_rows, out_cols, out_depth});

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "Conv2D: in_depth = " << in_depth
            << ", input_cols = " << input_cols
            << ", filter_cols = " << filter_cols
            << ", input_rows = " << input_rows
            << ", filter_rows = " << filter_rows << ", stride = " << stride
            << ", out_depth = " << out_depth;

    // An output with a zero-sized dimension has no elements to compute, so
    // there is no point in handing empty tensors to the device launcher.
    if (out_shape.num_elements() == 0) {
      return;
    }

    LaunchConvOp<Device, T>::launch(context, use_cudnn_, input, filter, stride,
                                    BrainPadding2EigenPadding(padding_),
                                    output);
  }

 private:
  std::vector<int32> strides_;  // The "strides" attribute (4 entries).
  bool use_cudnn_;              // "use_cudnn_on_gpu" AND CanUseCudnn().
  Padding padding_;             // The "padding" attribute.

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
};

// Registration of the CPU implementation (float only).
REGISTER_KERNEL_BUILDER(
    Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
    Conv2DOp<CPUDevice, float>);

#if GOOGLE_CUDA

namespace {
// Wraps a raw pointer to `size` elements of T in a typed DeviceMemory<T>
// handle covering size * sizeof(T) bytes.  The const qualifier is cast away
// to build the untyped DeviceMemoryBase; no data is copied here.
template <typename T>
perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
                                                    uint64 size) {
  namespace gpu = perftools::gputools;
  T* const mutable_memory = const_cast<T*>(cuda_memory);
  gpu::DeviceMemoryBase untyped(mutable_memory, size * sizeof(T));
  return gpu::DeviceMemory<T>(untyped);
}
}  // namespace

// GPU specialization of the Conv2D launcher.
//
// When use_cudnn is set the work is enqueued on the op's GPU stream:
//   * a 1x1 filter with unit stride is computed as a single BLAS GEMM;
//   * every other case goes through ThenConvolve, after explicitly padding
//     the input (for SAME padding) and rearranging the filter into a
//     temporary tensor.
// When use_cudnn is not set the generic Eigen-based launcher is used.
//
// `output` must already be allocated with its final shape.  Failures are
// reported by setting an error status on `ctx`.
template <typename T>
struct LaunchConvOp<GPUDevice, T> {
  static void launch(OpKernelContext* ctx, bool use_cudnn,
                     const Tensor& input_param, const Tensor& filter,
                     int stride, const Eigen::PaddingType& padding,
                     Tensor* output) {
    auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

    if (use_cudnn) {
      Tensor input = input_param;
      // The GEMM shortcut treats every input position as one output row, which
      // only holds for unit stride.  With a larger stride the output has fewer
      // positions than the input and the GEMM would write past the end of the
      // output buffer, so such cases must take the convolution path below.
      // This mirrors the 1x1 condition used by LaunchGeneric and
      // Conv2DOp::Compute.
      if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && stride == 1) {
        // 1x1 filter, so call cublas directly.
        const uint64 m =
            input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
        const uint64 k = filter.dim_size(2);
        const uint64 n = filter.dim_size(3);

        auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                    input.template flat<T>().size());
        auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                    filter.template flat<T>().size());
        auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                    output->template flat<T>().size());

        // NOTE(review): the operands are passed filter-first with dimensions
        // (n, m, k); presumably this compensates for a column-major BLAS
        // operating on row-major tensors -- confirm against ThenBlasGemm.
        auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
        bool blas_launch_status =
            stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f,
                                 b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n)
                .ok();
        if (!blas_launch_status) {
          ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
                                          ", n=", n, ", k=", k));
        }
        return;
      }
      if (padding == Eigen::PADDING_SAME) {
        const int64 out_rows = output->dim_size(1);
        const int64 out_cols = output->dim_size(2);
        const int64 in_rows = input.dim_size(1);
        const int64 in_cols = input.dim_size(2);
        const int64 patch_rows = filter.dim_size(0);
        const int64 patch_cols = filter.dim_size(1);
        // Total padding on rows and cols is
        // Pr = (R' - 1) * S + Kr - R
        // Pc = (C' - 1) * S + Kc - C
        // where (R', C') are output dimensions, (R, C) are input dimensions, S
        // is stride, (Kr, Kc) are filter dimensions.
        // Half of each total (rounded down) goes before the data and the
        // remainder after it, so when Pr or Pc is odd the extra element of
        // padding lands on the trailing side.
        //
        // The formulas go negative when the stride exceeds the filter extent
        // (the strided windows already fit inside the input).  No padding is
        // needed then, so clamp at zero instead of shrinking the input.
        const int64 needed_rows =
            (out_rows - 1) * stride + patch_rows - in_rows;
        const int64 needed_cols =
            (out_cols - 1) * stride + patch_cols - in_cols;
        const int padding_rows =
            needed_rows > 0 ? static_cast<int>(needed_rows) : 0;
        const int padding_cols =
            needed_cols > 0 ? static_cast<int>(needed_cols) : 0;
        Tensor transformed_input;
        OP_REQUIRES_OK(
            ctx, ctx->allocate_temp(
                     DataTypeToEnum<T>::value,
                     TensorShape(
                         {input.dim_size(0), input.dim_size(1) + padding_rows,
                          input.dim_size(2) + padding_cols, input.dim_size(3)}),
                     &transformed_input));

        functor::PadInput<GPUDevice, T>()(
            ctx->eigen_device<GPUDevice>(), input_param.tensor<T, 4>(),
            padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
            padding_cols - padding_cols / 2, transformed_input.tensor<T, 4>());
        // From here on the convolution runs on the padded copy.
        input = transformed_input;
      }

      // Describe the (possibly padded) input, the output and the filter.
      perftools::gputools::dnn::BatchDescriptor input_desc;
      input_desc.set_count(input.dim_size(0))
          .set_height(input.dim_size(1))
          .set_width(input.dim_size(2))
          .set_feature_map_count(input.dim_size(3))
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
      perftools::gputools::dnn::BatchDescriptor output_desc;
      output_desc.set_count(output->dim_size(0))
          .set_height(output->dim_size(1))
          .set_width(output->dim_size(2))
          .set_feature_map_count(output->dim_size(3))
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
      perftools::gputools::dnn::FilterDescriptor filter_desc;
      filter_desc.set_input_filter_height(filter.dim_size(0))
          .set_input_filter_width(filter.dim_size(1))
          .set_input_feature_map_count(filter.dim_size(2))
          .set_output_feature_map_count(filter.dim_size(3));
      perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
      conv_desc.set_vertical_filter_stride(stride)
          .set_horizontal_filter_stride(stride);

      // The filter is rearranged into a temporary whose dimensions are the
      // original ones in the order (3, 2, 0, 1) before being handed to
      // ThenConvolve.
      Tensor transformed_filter;
      OP_REQUIRES_OK(ctx,
                     ctx->allocate_temp(
                         DataTypeToEnum<T>::value,
                         TensorShape({filter.dim_size(3), filter.dim_size(2),
                                      filter.dim_size(0), filter.dim_size(1)}),
                         &transformed_filter));

      functor::TransformFilter<GPUDevice, T>()(
          ctx->eigen_device<GPUDevice>(), filter.tensor<T, 4>(),
          transformed_filter.tensor<T, 4>());

      auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                      input.template flat<T>().size());
      auto filter_ptr =
          AsDeviceMemory(transformed_filter.template flat<T>().data(),
                         transformed_filter.template flat<T>().size());
      auto output_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                       output->template flat<T>().size());

      bool cudnn_launch_status =
          stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr,
                               conv_desc, output_desc, &output_ptr)
              .ok();

      if (!cudnn_launch_status) {
        ctx->SetStatus(errors::Internal(
            "cuDNN launch failure : input shape(", input.shape().DebugString(),
            ") filter shape(", filter.shape().DebugString(), ")"));
      }
    } else {
      LaunchGeneric<GPUDevice, T>::launch(ctx, input_param, filter, stride,
                                          padding, output);
    }
  }
};

#endif  // GOOGLE_CUDA

#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
//
// Each operator() is declared as an explicit specialization and each functor
// struct as an extern template, so this translation unit only references them
// and does not instantiate them itself.  The definitions are expected to come
// from a separately compiled source (presumably the CUDA sources -- confirm
// against the build).
namespace functor {
// Declares, for element type T, the GPU specializations of the four functors
// used in this file: SpatialConvolution, MatMulConvFunctor, TransformFilter
// and PadInput.  The macro deliberately ends without a semicolon; the use site
// supplies it.
#define DECLARE_GPU_SPEC(T)                                                  \
  template <>                                                                \
  void SpatialConvolution<GPUDevice, T>::operator()(                         \
      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,              \
      typename TTypes<T, 4>::ConstTensor input,                              \
      typename TTypes<T, 4>::ConstTensor filter, int stride,                 \
      const Eigen::PaddingType& padding);                                    \
  extern template struct SpatialConvolution<GPUDevice, T>;                   \
  template <>                                                                \
  void MatMulConvFunctor<GPUDevice, T>::operator()(                          \
      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                 \
      typename TTypes<T, 2>::ConstTensor in0,                                \
      typename TTypes<T, 2>::ConstTensor in1,                                \
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
  extern template struct MatMulConvFunctor<GPUDevice, T>;                    \
  template <>                                                                \
  void TransformFilter<GPUDevice, T>::operator()(                            \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,             \
      typename TTypes<T, 4>::Tensor out);                                    \
  extern template struct TransformFilter<GPUDevice, T>;                      \
  template <>                                                                \
  void PadInput<GPUDevice, T>::operator()(                                   \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,             \
      int padding_rows_left, int padding_rows_right, int padding_cols_left,  \
      int padding_cols_right, typename TTypes<T, 4>::Tensor out);            \
  extern template struct PadInput<GPUDevice, T>

// Only float is declared, matching the single GPU kernel registered below.
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
}  // namespace functor

// Registration of the GPU implementation (float only).
REGISTER_KERNEL_BUILDER(
    Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    Conv2DOp<GPUDevice, float>);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow