/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/image_ops.cc
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/adjust_contrast_op.h"
#include <memory>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// AdjustContrastOp is deprecated as of GraphDef version >= 2
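//
// The op recenters each pixel around the per-image, per-channel mean:
// roughly output = (input - mean) * contrast_factor + mean, with the result
// clamped to [min_value, max_value]. A minimal element-wise sketch, assuming
// that clamping behavior; AdjustContrastPixel is a hypothetical helper for
// illustration only (the actual functor lives in adjust_contrast_op.h):
//
//   float AdjustContrastPixel(float x, float mean, float factor,
//                             float min_value, float max_value) {
//     // Scale the distance from the mean, then clamp to the valid range.
//     float y = (x - mean) * factor + mean;
//     return std::min(max_value, std::max(min_value, y));
//   }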

template <typename Device, typename T>
class AdjustContrastOp : public OpKernel {
 public:
  explicit AdjustContrastOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& factor = context->input(1);
    const Tensor& min_value = context->input(2);
    const Tensor& max_value = context->input(3);
    OP_REQUIRES(context, input.dims() >= 3,
                errors::InvalidArgument("input must be at least 3-D, got shape",
                                        input.shape().DebugString()));
    const int64 height = input.dim_size(input.dims() - 3);
    const int64 width = input.dim_size(input.dims() - 2);
    const int64 channels = input.dim_size(input.dims() - 1);

    OP_REQUIRES(context, TensorShapeUtils::IsScalar(factor.shape()),
                errors::InvalidArgument("contrast_factor must be scalar: ",
                                        factor.shape().DebugString()));
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(min_value.shape()),
                errors::InvalidArgument("min_value must be scalar: ",
                                        min_value.shape().DebugString()));
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(max_value.shape()),
                errors::InvalidArgument("max_value must be scalar: ",
                                        max_value.shape().DebugString()));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));

    Tensor mean_values;
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::value,
                                                   TensorShape(input.shape()),
                                                   &mean_values));

    if (input.NumElements() > 0) {
      const int64 batch = input.NumElements() / (height * width * channels);
      const int64 shape[4] = {batch, height, width, channels};
      functor::AdjustContrast<Device, T>()(
          context->eigen_device<Device>(), input.shaped<T, 4>(shape),
          factor.scalar<float>(), min_value.scalar<float>(),
          max_value.scalar<float>(), mean_values.shaped<float, 4>(shape),
          output->shaped<float, 4>(shape));
    }
  }
};

#define REGISTER_KERNEL(T)                                              \
  REGISTER_KERNEL_BUILDER(                                              \
      Name("AdjustContrast").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      AdjustContrastOp<CPUDevice, T>);

REGISTER_KERNEL(uint8);
REGISTER_KERNEL(int8);
REGISTER_KERNEL(int16);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL

#if GOOGLE_CUDA
// Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here; they will be built when compiling
// _gpu.cu.cc).
namespace functor {
#define DECLARE_GPU_SPEC(T)                                         \
  template <>                                                       \
  void AdjustContrast<GPUDevice, T>::operator()(                    \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
      typename TTypes<float>::ConstScalar contrast_factor,          \
      typename TTypes<float>::ConstScalar min_value,                \
      typename TTypes<float>::ConstScalar max_value,                \
      typename TTypes<float, 4>::Tensor mean_values,                \
      typename TTypes<float, 4>::Tensor output);                    \
  extern template struct AdjustContrast<GPUDevice, T>;

DECLARE_GPU_SPEC(uint8);
DECLARE_GPU_SPEC(int8);
DECLARE_GPU_SPEC(int16);
DECLARE_GPU_SPEC(int32);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC
}  // namespace functor

// Registration of the GPU implementations.
#define REGISTER_GPU_KERNEL(T)                                          \
  REGISTER_KERNEL_BUILDER(                                              \
      Name("AdjustContrast").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      AdjustContrastOp<GPUDevice, T>);
REGISTER_GPU_KERNEL(uint8);
REGISTER_GPU_KERNEL(int8);
REGISTER_GPU_KERNEL(int16);
REGISTER_GPU_KERNEL(int32);
REGISTER_GPU_KERNEL(float);
REGISTER_GPU_KERNEL(double);
#undef REGISTER_GPU_KERNEL

#endif  // GOOGLE_CUDA

class AdjustContrastOpV2Base : public OpKernel {
 protected:
  explicit AdjustContrastOpV2Base(OpKernelConstruction* context)
      : OpKernel(context) {}

  struct ComputeOptions {
    const Tensor* input = nullptr;
    const Tensor* factor = nullptr;
    Tensor* output = nullptr;
    int64 batch = 0;
    int64 height = 0;
    int64 width = 0;
    int64 channels = 0;
  };

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& factor = context->input(1);
    OP_REQUIRES(context, input.dims() >= 3,
                errors::InvalidArgument("input must be at least 3-D, got shape",
                                        input.shape().DebugString()));
    const int64 height = input.dim_size(input.dims() - 3);
    const int64 width = input.dim_size(input.dims() - 2);
    const int64 channels = input.dim_size(input.dims() - 1);

    OP_REQUIRES(context, TensorShapeUtils::IsScalar(factor.shape()),
                errors::InvalidArgument("contrast_factor must be scalar: ",
                                        factor.shape().DebugString()));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));

    if (input.NumElements() > 0) {
      const int64 batch = input.NumElements() / (height * width * channels);
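      // E.g. a [2, 4, 640, 480, 3] input is flattened to a batch of
      // 2 * 4 = 8 images, each of shape 640 x 480 x 3.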
      ComputeOptions options;
      options.input = &input;
      options.factor = &factor;
      options.output = output;
      options.batch = batch;
      options.height = height;
      options.width = width;
      options.channels = channels;
      DoCompute(context, options);
    }
  }

  virtual void DoCompute(OpKernelContext* context,
                         const ComputeOptions& options) = 0;
};

template <typename Device>
class AdjustContrastOpv2;

template <>
class AdjustContrastOpv2<CPUDevice> : public AdjustContrastOpV2Base {
 public:
  explicit AdjustContrastOpv2(OpKernelConstruction* context)
      : AdjustContrastOpV2Base(context) {}

  void DoCompute(OpKernelContext* context,
                 const ComputeOptions& options) override {
    const int64 batch = options.batch;
    const int64 height = options.height;
    const int64 width = options.width;
    const int64 channels = options.channels;
    const int64 image_size = height * width;
    const Tensor* input = options.input;
    const Tensor* factor = options.factor;
    Tensor* output = options.output;
    Tensor mean_values;
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<float>::value,
                                TensorShape({batch, channels}), &mean_values));
    // TODO(zhengxq): for multiple batches, shard the work across batches.
    auto input_data = input->shaped<float, 3>({batch, image_size, channels});
    auto mean_data = mean_values.tensor<float, 2>();
    auto output_data = output->shaped<float, 3>({batch, image_size, channels});

    // Calculate the mean of the inputs.
    ReduceMeanAcrossImage(input_data, mean_data, output_data);
    // Broadcast the mean into the outputs.
    BroadcastAcrossImage(mean_data, output_data);
    // Increment the outputs with the scaled difference between inputs and
    // outputs, going through their flat structure.
    IncrementWithScaling(input_data, factor->scalar<float>(), output_data);
  }
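
  // The three phases above compose to the element-wise update
  //   output = mean + factor * (input - mean)
  //          = factor * input + (1 - factor) * mean.
  // A minimal scalar sketch of the same computation, where `factor` is the
  // scalar contrast factor, ignoring the vectorization tricks implemented by
  // the helpers below (illustrative only):
  //
  //   for (int64 b = 0; b < batch; ++b) {
  //     for (int64 c = 0; c < channels; ++c) {
  //       float mean = 0.0f;
  //       for (int64 p = 0; p < image_size; ++p) mean += input_data(b, p, c);
  //       mean /= image_size;
  //       for (int64 p = 0; p < image_size; ++p) {
  //         output_data(b, p, c) =
  //             mean + factor * (input_data(b, p, c) - mean);
  //       }
  //     }
  //   }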

 private:
  // Compute the mean of the inputs along the image dimension, i.e. dim 1, of a
  // 3D tensor. Effectively means(i, k) = inputs(i, :, k).mean().
  void ReduceMeanAcrossImage(typename TTypes<float, 3>::ConstTensor input,
                             typename TTypes<float, 2>::Tensor mean,
                             typename TTypes<float, 3>::Tensor scratch) {
    const int64 batch = input.dimension(0);
    const int64 image_size = input.dimension(1);
    const int64 channels = input.dimension(2);
    TTypes<float, 1>::ConstTensor input_flat(&input(0, 0, 0), input.size());
    TTypes<float, 1>::Tensor mean_flat(&mean(0, 0), mean.size());
    TTypes<float, 1>::Tensor summation_scratch(&scratch(0, 0, 0),
                                               scratch.size());
    typedef Eigen::array<Eigen::DenseIndex, 1> Index;
    const int64 plane_size = image_size * channels;
    // Since the number of channels in the early layers is often small, a
    // straightforward loop for summing cannot utilize vectorization.
    // This algorithm repeatedly folds each image plane by half, until
    // only one set of channels remains.
    for (int64 i = 0; i < batch; i++) {
      auto input_plane =
          input_flat.slice(Index(i * plane_size), Index(plane_size));
      auto summation_plane =
          summation_scratch.slice(Index(i * plane_size), Index(plane_size));
      int64 remaining_size = image_size;
      int round = 0;
      // Sum the input(i, :, k) into mean(i, k). Repeatedly split the input
      // array in half and sum the two halves, until only one set of channels
      // is left, which holds the sum. Since each half is large enough, this
      // leads to much better vectorization across components. An example of
      // how this works:
      //
      //   x = float[4096, 3]
      //   round 0
      //     y[:2048, :] = x[:2048, :] + x[2048:, :]
      //   round 1
      //     y[:1024, :] += y[1024:2048, :]
      //   round 2
      //     y[:512, :] += y[512:1024, :]
      //   ...
      //   round 11
      //     y[:1, :] += y[1:2, :]
      //   At this point y[0, :] holds the sum of all x[:, :]
      //
      // The algorithm itself can handle sizes that are not powers of two. Note
      // that in each round we sum up elements that are contiguous, so we can
      // use their flattened structure to gain vectorization efficiency.
      do {
        int64 right_size = remaining_size / 2;
        int64 left_size = remaining_size - right_size;
        DCHECK(left_size == right_size || left_size == right_size + 1);
        if (round == 0) {
          // In the first round, sum the left side and right side of the input
          // array into the summation area.
          summation_plane.slice(Index(0), Index(right_size * channels)) =
              input_plane.slice(Index(left_size * channels),
                                Index(right_size * channels)) +
              input_plane.slice(Index(0), Index(right_size * channels));
          if (left_size > right_size) {
            DCHECK_EQ(left_size - right_size, 1);
            // Copy over the remaining column if the remaining_size is odd.
            // This also handles the case where image_size == 1.
            summation_plane.slice(Index(right_size * channels),
                                  Index(channels)) =
                input_plane.slice(Index(right_size * channels),
                                  Index(channels));
          }
        } else {
          // For all the remaining rounds, add the second half of the
          // summation area into its first half. With the flat structure and
          // large size, this utilizes vectorization across components.
          summation_plane.slice(Index(0), Index(right_size * channels)) +=
              summation_plane.slice(Index(left_size * channels),
                                    Index(right_size * channels));
        }
        remaining_size = left_size;
        round++;
      } while (remaining_size > 1);
      const float mean_scaling = 1.0f / image_size;
      // The first `channels` elements of summation_plane now hold the sum.
      // Scale by 1 / image_size and copy the result over to the means.
      auto mean_plane = mean_flat.slice(Index(i * channels), Index(channels));
      mean_plane =
          summation_plane.slice(Index(0), Index(channels)) * mean_scaling;
    }
  }
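
  // A self-contained scalar sketch of the fold-by-half summation above,
  // operating on a raw buffer. `FoldSum` and its parameters are hypothetical
  // names used only for illustration:
  //
  //   // Sum `n` rows of `channels` floats from `in`; the result ends up in
  //   // scratch[0, channels).
  //   void FoldSum(const float* in, float* scratch, int64 n, int64 channels) {
  //     int64 right = n / 2, left = n - right;
  //     // Round 0: fold the two halves of the input into the scratch area.
  //     for (int64 j = 0; j < right * channels; ++j)
  //       scratch[j] = in[j] + in[left * channels + j];
  //     if (left > right)  // odd n: carry the unpaired middle row over
  //       std::copy_n(in + right * channels, channels,
  //                   scratch + right * channels);
  //     // Later rounds: fold the scratch area onto itself until one row
  //     // of `channels` sums remains.
  //     for (int64 size = left; size > 1; size -= size / 2) {
  //       int64 half = size / 2, keep = size - half;
  //       for (int64 j = 0; j < half * channels; ++j)
  //         scratch[j] += scratch[keep * channels + j];
  //     }
  //   }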

  // Broadcast the 2D inputs into the 3D outputs across the image dimension,
  // i.e. dim 1.
  void BroadcastAcrossImage(typename TTypes<float, 2>::Tensor inputs,
                            typename TTypes<float, 3>::Tensor outputs) {
    int64 batch = outputs.dimension(0);
    int64 image_size = outputs.dimension(1);
    int64 channels = outputs.dimension(2);
    // Similar to the reduction case, a straightforward implementation of this
    // does not utilize vectorization well because of the small channel size.
    // This algorithm repeatedly increases the area to be copied, which leads
    // to much better vectorization in the copy.
    for (int64 i = 0; i < batch; i++) {
      // Copy over the inputs into outputs in this batch. Effectively:
      // outputs(i, :, k) = inputs(i, k). An example of how this algorithm works:
      //
      //    x = float[1, 3], y = float[2048, 3]
      //    round 0
      //      y[:1, :] = x[:, :]
      //    round 1
      //      y[1:2, :] = y[:1, :]
      //    round 2
      //      y[2:4, :] = y[:2, :]
      //    round 3
      //      y[4:8, :] = y[:4, :]
      //    ...
      //    round 11
      //      y[1024:2048, :] = y[:1024, :]
      //    At this point y[:, k] == x[k]
      //
      // The algorithm works for sizes that are not powers of two. In each
      // round, the elements that are copied are contiguous, so the copy
      // benefits from the vectorization of memcpy.
      const float* mean_p = &inputs(i, 0);
      // Copy the first set of channels.
      float* output_p = &outputs(i, 0, 0);
      memcpy(output_p, mean_p, sizeof(float) * channels);
      int64 copied = 1;
      while (copied < image_size) {
        // Repeatedly increase the number of elements to copy so the copies
        // vectorize better. However, the source of the copy has to be small
        // enough to stay in the cache.
        const int64 kMaxToCopy = 1024;
        int64 to_copy = std::min({copied, image_size - copied, kMaxToCopy});
        memcpy(output_p + channels * copied, output_p,
               to_copy * channels * sizeof(float));
        copied += to_copy;
      }
    }
  }
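
  // The doubling copy above, sketched on a raw buffer; `FillByDoubling` is a
  // hypothetical name used only for illustration:
  //
  //   // Replicate the first `channels` floats of `out` across all `rows`.
  //   void FillByDoubling(float* out, int64 rows, int64 channels) {
  //     int64 copied = 1;  // row 0 already holds the source values
  //     while (copied < rows) {
  //       const int64 kMaxToCopy = 1024;  // keep the copy source in cache
  //       int64 to_copy = std::min({copied, rows - copied, kMaxToCopy});
  //       memcpy(out + copied * channels, out,
  //              to_copy * channels * sizeof(float));
  //       copied += to_copy;
  //     }
  //   }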

  // Increment the outputs with the scaled difference between inputs and
  // outputs. Effectively: outputs += factor * (inputs - outputs).
  void IncrementWithScaling(typename TTypes<float, 3>::ConstTensor input,
                            typename TTypes<float>::ConstScalar factor,
                            typename TTypes<float, 3>::Tensor output) {
    const float factor_value = factor();
    float* p = output.data();
    const float* q = input.data();
    for (int64 n = 0; n < input.size(); ++n) {
      p[n] += factor_value * (q[n] - p[n]);
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_CPU),
                        AdjustContrastOpv2<CPUDevice>);

#if GOOGLE_CUDA
// Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here; they will be built when compiling
// _gpu.cu.cc).
namespace functor {
template <>
void AdjustContrastv2<GPUDevice>::operator()(
    const GPUDevice& d, typename TTypes<float, 4>::ConstTensor input,
    typename TTypes<float>::ConstScalar contrast_factor,
    typename TTypes<float, 4>::Tensor output);
extern template struct AdjustContrastv2<GPUDevice>;
}  // namespace functor

template <>
class AdjustContrastOpv2<GPUDevice> : public AdjustContrastOpV2Base {
 public:
  explicit AdjustContrastOpv2(OpKernelConstruction* context)
      : AdjustContrastOpV2Base(context) {}

  void DoCompute(OpKernelContext* context,
                 const ComputeOptions& options) override {
    const int64 shape[4] = {options.batch, options.height, options.width,
                            options.channels};
    functor::AdjustContrastv2<GPUDevice>()(
        context->eigen_device<GPUDevice>(),
        options.input->shaped<float, 4>(shape), options.factor->scalar<float>(),
        options.output->shaped<float, 4>(shape));
  }
};

REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_GPU),
                        AdjustContrastOpv2<GPUDevice>);
#endif  // GOOGLE_CUDA

}  // namespace tensorflow