/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"

namespace tensorflow {

using shape_inference::Dimension;
using shape_inference::InferenceContext;
using shape_inference::Shape;

namespace {

// Sets output[0] to shape [batch_dim, height, width, channel_dim], where
// height and width come from the size_tensor.
Status SetOutputToSizedImage(InferenceContext* c, const Dimension* batch_dim,
                             int size_input_idx, const Dimension* channel_dim) {
  // Verify shape of size input.
  const Shape* size;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
  const Dimension* unused;
  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));

  // Get size values from the size tensor.
  const Tensor* size_tensor = c->input_tensor(size_input_idx);
  const Dimension* width;
  const Dimension* height;
  if (size_tensor == nullptr) {
    width = c->UnknownDim();
    height = c->UnknownDim();
  } else {
    height = c->MakeDim(size_tensor->flat<int32>()(0));
    width = c->MakeDim(size_tensor->flat<int32>()(1));
  }
  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
  return Status::OK();
}

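// Shape function for resize ops: the input must be 4-D, the batch and
// channel dimensions pass through, and the output height and width come
// from the size tensor at input index 1.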
Status ResizeShapeFn(InferenceContext* c) {
  const Shape* input;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
  return SetOutputToSizedImage(c, c->Dim(input, 0), 1 /* size_input_idx */,
                               c->Dim(input, 3));
}

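// Shape function for image-decoding ops: a scalar string input yields a 3-D
// [height, width, channels] image, with height and width unknown and the
// channels dimension taken from the "channels" attr when present.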
Status DecodeImageShapeFn(InferenceContext* c) {
  const Shape* unused;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
  const Dimension* channels_dim;
  int32 channels;
  Status s = c->GetAttr("channels", &channels);
  if (s.ok()) {
    if (channels < 0) {
      return errors::InvalidArgument("channels must be non-negative, got ",
                                     channels);
    }
    channels_dim = c->MakeDim(channels);
  } else {
    channels_dim = c->UnknownDim();
  }

  c->set_output(0, c->MakeShape({InferenceContext::kUnknownDim,
                                 InferenceContext::kUnknownDim, channels_dim}));
  return Status::OK();
}

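// Shape function for image-encoding ops: a 3-D image input yields a scalar
// string output.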
Status EncodeImageShapeFn(InferenceContext* c) {
  const Shape* unused;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &unused));
  c->set_output(0, c->Scalar());
  return Status::OK();
}

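// Shape function for colorspace-conversion ops: the input must be at least
// rank 1 with a final dimension of 3, and the output shape equals the input
// shape.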
Status ColorspaceShapeFn(InferenceContext* c) {
  const Shape* input;
  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &input));

  // The last dimension value is always 3.
  const Dimension* last_dim;
  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input, -1), 3, &last_dim));
  const Shape* out;
  TF_RETURN_IF_ERROR(c->ReplaceDim(input, -1, last_dim, &out));
  c->set_output(0, out);

  return Status::OK();
}

}  // namespace

// --------------------------------------------------------------------------
REGISTER_OP("ResizeArea")
    .Input("images: T")
    .Input("size: int32")
    .Output("resized_images: float")
    .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}")
    .Attr("align_corners: bool = false")
    .SetShapeFn(ResizeShapeFn)
    .Doc(R"doc(
Resize `images` to `size` using area interpolation.

Input images can be of different types but output images are always float.

images: 4-D with shape `[batch, height, width, channels]`.
size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
  new size for the images.
align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
  exactly aligns the 4 corners of images and resized images. If false, rescale
  by new_height / height. Treat similarly the width dimension.
resized_images: 4-D with shape
  `[batch, new_height, new_width, channels]`.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("ResizeBicubic")
    .Input("images: T")
    .Input("size: int32")
    .Output("resized_images: float")
    .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}")
    .Attr("align_corners: bool = false")
    .SetShapeFn(ResizeShapeFn)
    .Doc(R"doc(
Resize `images` to `size` using bicubic interpolation.

Input images can be of different types but output images are always float.

images: 4-D with shape `[batch, height, width, channels]`.
size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
  new size for the images.
align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
  exactly aligns the 4 corners of images and resized images. If false, rescale
  by new_height / height. Treat similarly the width dimension.
resized_images: 4-D with shape
  `[batch, new_height, new_width, channels]`.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("ResizeBilinear")
    .Input("images: T")
    .Input("size: int32")
    .Output("resized_images: float")
    .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}")
    .Attr("align_corners: bool = false")
    .SetShapeFn(ResizeShapeFn)
    .Doc(R"doc(
Resize `images` to `size` using bilinear interpolation.

Input images can be of different types but output images are always float.
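
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.resize_bilinear`, with `images` a 4-D `uint8` tensor):

    # Bilinearly resize a batch of images to 256x256; the output is float.
    resized = tf.image.resize_bilinear(images, [256, 256], align_corners=False)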

images: 4-D with shape `[batch, height, width, channels]`.
size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
  new size for the images.
align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
  exactly aligns the 4 corners of images and resized images. If false, rescale
  by new_height / height. Treat similarly the width dimension.
resized_images: 4-D with shape
  `[batch, new_height, new_width, channels]`.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("ResizeBilinearGrad")
    .Input("grads: float")
    .Input("original_image: T")
    .Output("output: T")
    .Attr("T: {float, half, double}")
    .Attr("align_corners: bool = false")
    .Doc(R"doc(
Computes the gradient of bilinear interpolation.

grads: 4-D with shape `[batch, height, width, channels]`.
original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
  the image tensor that was resized.
align_corners: If true, rescale grads by (orig_height - 1) / (height - 1), which
  exactly aligns the 4 corners of grads and original_image. If false, rescale by
  orig_height / height. Treat similarly the width dimension.
output: 4-D with shape `[batch, orig_height, orig_width, channels]`.
  Gradients with respect to the input image. Input image must have been
  float or double.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("ResizeNearestNeighbor")
    .Input("images: T")
    .Input("size: int32")
    .Output("resized_images: T")
    .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}")
    .Attr("align_corners: bool = false")
    .SetShapeFn(ResizeShapeFn)
    .Doc(R"doc(
Resize `images` to `size` using nearest neighbor interpolation.

images: 4-D with shape `[batch, height, width, channels]`.
size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
  new size for the images.
align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
  exactly aligns the 4 corners of images and resized images. If false, rescale
  by new_height / height. Treat similarly the width dimension.
resized_images: 4-D with shape
  `[batch, new_height, new_width, channels]`.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("ResizeNearestNeighborGrad")
    .Input("grads: T")
    .Input("size: int32")
    .Output("output: T")
    .Attr("T: {uint8, int8, int32, half, float, double}")
    .Attr("align_corners: bool = false")
    .Doc(R"doc(
Computes the gradient of nearest neighbor interpolation.

grads: 4-D with shape `[batch, height, width, channels]`.
size: A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
  original input size.
align_corners: If true, rescale grads by (orig_height - 1) / (height - 1), which
  exactly aligns the 4 corners of grads and original_image. If false, rescale by
  orig_height / height. Treat similarly the width dimension.
output: 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
  with respect to the input image.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("RandomCrop")
    .Input("image: T")
    .Input("size: int64")
    .Output("output: T")
    .Attr("T: {uint8, int8, int16, int32, int64, float, double}")
    .Attr("seed: int = 0")
    .Attr("seed2: int = 0")
    .SetIsStateful()
    .Deprecated(8, "Random crop is now pure Python")
    .Doc(R"doc(
Randomly crop `image`.

`size` is a 1-D int64 tensor with 2 elements representing the crop height and
width.  The values must be non-negative.

This Op picks a random location in `image` and crops a `height` by `width`
rectangle from that location.  The random location is picked so the cropped
area will fit inside the original image.

image: 3-D of shape `[height, width, channels]`.
size: 1-D of length 2 containing: `crop_height`, `crop_width`.
seed: If either seed or seed2 are set to be non-zero, the random number
  generator is seeded by the given seed.  Otherwise, it is seeded by a
  random seed.
seed2: A second seed to avoid seed collision.
output: 3-D of shape `[crop_height, crop_width, channels]`.
)doc");
// TODO(shlens): Support variable rank in RandomCrop.

// --------------------------------------------------------------------------
REGISTER_OP("DecodeJpeg")
    .Input("contents: string")
    .Attr("channels: int = 0")
    .Attr("ratio: int = 1")
    .Attr("fancy_upscaling: bool = true")
    .Attr("try_recover_truncated: bool = false")
    .Attr("acceptable_fraction: float = 1.0")
    .Output("image: uint8")
    .SetShapeFn(DecodeImageShapeFn)
    .Doc(R"doc(
Decode a JPEG-encoded image to a uint8 tensor.

The attr `channels` indicates the desired number of color channels for the
decoded image.

Accepted values are:

*   0: Use the number of channels in the JPEG-encoded image.
*   1: Output a grayscale image.
*   3: Output an RGB image.

If needed, the JPEG-encoded image is transformed to match the requested number
of color channels.

The attr `ratio` allows downscaling the image by an integer factor during
decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
downscaling the image later.
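
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.decode_jpeg`; `jpeg_bytes` is a hypothetical scalar string
tensor holding JPEG-encoded data):

    # Decode at half resolution, forcing a 3-channel RGB output.
    image = tf.image.decode_jpeg(jpeg_bytes, channels=3, ratio=2)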

contents: 0-D.  The JPEG-encoded image.
channels: Number of color channels for the decoded image.
ratio: Downscaling ratio.
fancy_upscaling: If true, use a slower but nicer upscaling of the
  chroma planes (yuv420/422 only).
try_recover_truncated: If true, try to recover an image from truncated input.
acceptable_fraction: The minimum required fraction of lines before a truncated
  input is accepted.
image: 3-D with shape `[height, width, channels]`.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("EncodeJpeg")
    .Input("image: uint8")
    .Attr("format: {'', 'grayscale', 'rgb'} = ''")
    .Attr("quality: int = 95")
    .Attr("progressive: bool = false")
    .Attr("optimize_size: bool = false")
    .Attr("chroma_downsampling: bool = true")
    .Attr("density_unit: {'in', 'cm'} = 'in'")
    .Attr("x_density: int = 300")
    .Attr("y_density: int = 300")
    .Attr("xmp_metadata: string = ''")
    .Output("contents: string")
    .SetShapeFn(EncodeImageShapeFn)
    .Doc(R"doc(
JPEG-encode an image.

`image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.

The attr `format` can be used to override the color format of the encoded
output.  Values can be:

*   `''`: Use a default format based on the number of channels in the image.
*   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
    of `image` must be 1.
*   `rgb`: Output an RGB JPEG image. The `channels` dimension
    of `image` must be 3.

If `format` is not specified or is the empty string, a default format is picked
based on the number of channels in `image`:

*   1: Output a grayscale image.
*   3: Output an RGB image.
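
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.encode_jpeg`, with `image` a 3-D `uint8` tensor):

    # Encode as an RGB JPEG at quality 90.
    contents = tf.image.encode_jpeg(image, format='rgb', quality=90)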

image: 3-D with shape `[height, width, channels]`.
format: Per-pixel image format.
quality: Quality of the compression from 0 to 100 (higher is better and slower).
progressive: If True, create a JPEG that loads progressively (coarse to fine).
optimize_size: If True, spend CPU/RAM to reduce size with no quality change.
chroma_downsampling: See http://en.wikipedia.org/wiki/Chroma_subsampling.
density_unit: Unit used to specify `x_density` and `y_density`:
   pixels per inch (`'in'`) or centimeter (`'cm'`).
x_density: Horizontal pixels per density unit.
y_density: Vertical pixels per density unit.
xmp_metadata: If not empty, embed this XMP metadata in the image header.
contents: 0-D. JPEG-encoded image.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("AdjustContrast")
    .Input("images: T")
    .Input("contrast_factor: float")
    .Input("min_value: float")
    .Input("max_value: float")
    .Output("output: float")
    .Attr("T: {uint8, int8, int16, int32, int64, float, double}")
    .Deprecated(2, "Use AdjustContrastv2 instead")
    .Doc(R"Doc(
Deprecated. Disallowed in GraphDef version >= 2.
)Doc");

// --------------------------------------------------------------------------
REGISTER_OP("AdjustContrastv2")
    .Input("images: float")
    .Input("contrast_factor: float")
    .Output("output: float")
    .Doc(R"Doc(
Adjust the contrast of one or more images.

`images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
interpreted as `[height, width, channels]`.  The other dimensions only
represent a collection of images, such as `[batch, height, width, channels]`.

Contrast is adjusted independently for each channel of each image.

For each channel, the Op first computes the mean of the image pixels in the
channel and then adjusts each component of each pixel to
`(x - mean) * contrast_factor + mean`.
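
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.adjust_contrast`, with `images` a float tensor):

    # Double the contrast of each channel around its per-channel mean.
    adjusted = tf.image.adjust_contrast(images, contrast_factor=2.0)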

images: Images to adjust.  At least 3-D.
contrast_factor: A float multiplier for adjusting contrast.
output: The contrast-adjusted image or images.
)Doc");

// --------------------------------------------------------------------------
REGISTER_OP("DecodePng")
    .Input("contents: string")
    .Attr("channels: int = 0")
    .Attr("dtype: {uint8, uint16} = DT_UINT8")
    .Output("image: dtype")
    .SetShapeFn(DecodeImageShapeFn)
    .Doc(R"doc(
Decode a PNG-encoded image to a uint8 or uint16 tensor.

The attr `channels` indicates the desired number of color channels for the
decoded image.

Accepted values are:

*   0: Use the number of channels in the PNG-encoded image.
*   1: Output a grayscale image.
*   3: Output an RGB image.
*   4: Output an RGBA image.

If needed, the PNG-encoded image is transformed to match the requested number
of color channels.
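
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.decode_png`; `png_bytes` is a hypothetical scalar string tensor
holding PNG-encoded data):

    # Decode as RGBA, producing a uint16 image.
    image = tf.image.decode_png(png_bytes, channels=4, dtype=tf.uint16)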

contents: 0-D.  The PNG-encoded image.
channels: Number of color channels for the decoded image.
image: 3-D with shape `[height, width, channels]`.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("EncodePng")
    .Attr("compression: int = -1")
    .Attr("T: {uint8, uint16} = DT_UINT8")
    .Input("image: T")
    .Output("contents: string")
    .SetShapeFn(EncodeImageShapeFn)
    .Doc(R"doc(
PNG-encode an image.

`image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
where `channels` is:

*   1: for grayscale.
*   2: for grayscale + alpha.
*   3: for RGB.
*   4: for RGBA.

The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
default or a value from 0 to 9.  9 is the highest compression level, generating
the smallest output, but is slower.
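
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.encode_png`, with `image` a 3-D `uint8` tensor):

    # Encode with the highest ZLIB compression level.
    contents = tf.image.encode_png(image, compression=9)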

image: 3-D with shape `[height, width, channels]`.
compression: Compression level.
contents: 0-D. PNG-encoded image.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("DecodeGif")
    .Input("contents: string")
    .Output("image: uint8")
    .Doc(R"doc(
Decode the frame(s) of a GIF-encoded image to a uint8 tensor.

GIFs with frame or transparency compression are not supported. Convert
animated GIFs from compressed to uncompressed with, for example, ImageMagick:

    convert $src.gif -coalesce $dst.gif
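
A minimal decode sketch via the Python wrapper (assuming it is exposed as
`tf.image.decode_gif`; `gif_bytes` is a hypothetical scalar string tensor
holding GIF-encoded data):

    images = tf.image.decode_gif(gif_bytes)  # [num_frames, height, width, 3]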

contents: 0-D.  The GIF-encoded image.
image: 4-D with shape `[num_frames, height, width, 3]`. RGB channel order.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("RGBToHSV")
    .Input("images: T")
    .Output("output: T")
    .Attr("T: {float, double} = DT_FLOAT")
    .SetShapeFn(ColorspaceShapeFn)
    .Doc(R"doc(
Converts one or more images from RGB to HSV.

Outputs a tensor of the same shape as the `images` tensor, containing the HSV
value of the pixels. The output is well defined only if the values in
`images` are in `[0,1]`.

`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
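
For example, a minimal round-trip sketch via the Python wrappers (assuming
they are exposed as `tf.image.rgb_to_hsv` and `tf.image.hsv_to_rgb`, with
`images` holding float RGB values in `[0,1]`):

    hsv = tf.image.rgb_to_hsv(images)
    rgb = tf.image.hsv_to_rgb(hsv)  # Recovers the original images.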

images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
output: `images` converted to HSV.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("HSVToRGB")
    .Input("images: T")
    .Output("output: T")
    .Attr("T: {float, double} = DT_FLOAT")
    .SetShapeFn(ColorspaceShapeFn)
    .Doc(R"doc(
Convert one or more images from HSV to RGB.

Outputs a tensor of the same shape as the `images` tensor, containing the RGB
value of the pixels. The output is well defined only if the values in
`images` are in `[0,1]`.

See `rgb_to_hsv` for a description of the HSV encoding.

images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
output: `images` converted to RGB.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("DrawBoundingBoxes")
    .Input("images: T")
    .Input("boxes: float")
    .Output("output: T")
    .Attr("T: {float, half} = DT_FLOAT")
    .Doc(R"doc(
Draw bounding boxes on a batch of images.

Outputs a copy of `images` but draws on top of the pixels zero or more bounding
boxes specified by the locations in `boxes`. The coordinates of each
bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
height of the underlying image.

For example, if an image is 100 x 200 pixels (height x width) and the bounding
box is `[0.1, 0.5, 0.2, 0.9]`, the upper-left and lower-right coordinates of
the bounding box will be `(10, 100)` and `(20, 180)` in `(y, x)` coordinates.

Parts of the bounding box may fall outside the image.
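
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.draw_bounding_boxes`, with `images` a 4-D float tensor and
`boxes` of shape `[batch, num_bounding_boxes, 4]`):

    # Overlay the boxes on the corresponding images.
    annotated = tf.image.draw_bounding_boxes(images, boxes)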

images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
  boxes.
output: 4-D with the same shape as `images`. The batch of input images with
  bounding boxes drawn on the images.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("SampleDistortedBoundingBox")
    .Input("image_size: T")
    .Input("bounding_boxes: float")
    .Output("begin: T")
    .Output("size: T")
    .Output("bboxes: float")
    .Attr("T: {uint8, int8, int16, int32, int64}")
    .Attr("seed: int = 0")
    .Attr("seed2: int = 0")
    .Attr("min_object_covered: float = 0.1")
    .Attr("aspect_ratio_range: list(float) = [0.75, 1.33]")
    .Attr("area_range: list(float) = [0.05, 1.0]")
    .Attr("max_attempts: int = 100")
    .Attr("use_image_if_no_bounding_boxes: bool = false")
    .SetIsStateful()
    .SetShapeFn([](InferenceContext* c) {
      c->set_output(0, c->Vector(3));
      c->set_output(1, c->Vector(3));
      c->set_output(2, c->MakeShape({1, 1, 4}));
      return Status::OK();
    })
    .Doc(R"doc(
Generate a single randomly distorted bounding box for an image.

Bounding box annotations are often supplied in addition to ground-truth labels
in image recognition or object localization tasks. A common technique for
training such a system is to randomly distort an image while preserving
its content, i.e. *data augmentation*. This Op outputs a randomly distorted
localization of an object, i.e. bounding box, given an `image_size`,
`bounding_boxes` and a series of constraints.

The output of this Op is a single bounding box that may be used to crop the
original image. The output is returned as 3 tensors: `begin`, `size` and
`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
image. The latter may be supplied to `tf.image.draw_bounding_boxes` to
visualize
what the bounding box looks like.

Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
height of the underlying image.

For example,

    # Generate a single distorted bounding box.
    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        bounding_boxes=bounding_boxes)

    # Draw the bounding box in an image summary.
    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
                                                  bbox_for_draw)
    tf.image_summary('images_with_box', image_with_box)

    # Employ the bounding box to distort the image.
    distorted_image = tf.slice(image, begin, size)

Note that if no bounding box information is available, setting
`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
false and no bounding boxes are supplied, an error is raised.

image_size: 1-D, containing `[height, width, channels]`.
bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
  associated with the image.
begin: 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
  `tf.slice`.
size: 1-D, containing `[target_height, target_width, -1]`. Provide as input to
  `tf.slice`.
bboxes: 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
  Provide as input to `tf.image.draw_bounding_boxes`.
seed: If either `seed` or `seed2` are set to non-zero, the random number
  generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
  seed.
seed2: A second seed to avoid seed collision.
min_object_covered: The cropped area of the image must contain at least this
  fraction of any bounding box supplied.
aspect_ratio_range: The cropped area of the image must have an aspect ratio =
  width / height within this range.
area_range: The cropped area of the image must contain a fraction of the
  supplied image within this range.
max_attempts: Number of attempts at generating a cropped region of the image
  that satisfies the specified constraints. After `max_attempts` failures,
  return the entire image.
use_image_if_no_bounding_boxes: Controls behavior if no bounding boxes are
  supplied. If true, assume an implicit bounding box covering the whole input.
  If false, raise an error.
)doc");

// --------------------------------------------------------------------------

// glimpse = extract_glimpse(input, size, offsets) extract the glimpse
// of size `size` centered at location `offsets` from the input tensor
// `input`.
//
// REQUIRES: input.dims() == 4
//
REGISTER_OP("ExtractGlimpse")
    .Input("input: float")
    .Input("size: int32")
    .Input("offsets: float")
    .Output("glimpse: float")
    .Attr("centered: bool = true")
    .Attr("normalized: bool = true")
    .Attr("uniform_noise: bool = true")
    .SetShapeFn([](InferenceContext* c) {
      const Shape* input;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
      const Shape* offsets;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &offsets));

      const Dimension* batch_dim;
      TF_RETURN_IF_ERROR(
          c->Merge(c->Dim(input, 0), c->Dim(offsets, 0), &batch_dim));
      const Dimension* unused;
      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(offsets, 1), 2, &unused));

      return SetOutputToSizedImage(c, batch_dim, 1 /* size_input_idx */,
                                   c->Dim(input, 3));
    })
    .Doc(R"doc(
Extracts a glimpse from the input tensor.

Returns a set of windows called glimpses extracted at locations
`offsets` from the input tensor. If a window only partially
overlaps the input, the non-overlapping areas are filled with
random noise.

The result is a 4-D tensor of shape `[batch_size, glimpse_height,
glimpse_width, channels]`. The channels and batch dimensions are the
same as that of the input tensor. The height and width of the output
windows are specified in the `size` parameter.

The arguments `normalized` and `centered` control how the windows are built:

* If the coordinates are normalized but not centered, 0.0 and 1.0
  correspond to the minimum and maximum of each height and width
  dimension.
* If the coordinates are both normalized and centered, they range from
  -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
  left corner, the lower right corner is located at (1.0, 1.0) and the
  center is at (0, 0).
* If the coordinates are not normalized they are interpreted as
  numbers of pixels.
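
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.extract_glimpse`, with `input` a 4-D float tensor and `offsets`
of shape `[batch_size, 2]`):

    # Extract 64x64 glimpses at centered, normalized offsets.
    glimpses = tf.image.extract_glimpse(input, size=[64, 64], offsets=offsets,
                                        centered=True, normalized=True)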

input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
size: A 1-D tensor of 2 elements containing the size of the glimpses
  to extract.  The glimpse height must be specified first, followed
  by the glimpse width.
offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
  the x, y locations of the center of each window.
glimpse: A tensor representing the glimpses `[batch_size,
  glimpse_height, glimpse_width, channels]`.
centered: indicates if the offset coordinates are centered relative to
  the image, in which case the (0, 0) offset is relative to the center
  of the input images. If false, the (0,0) offset corresponds to the
  upper left corner of the input images.
normalized: indicates if the offset coordinates are normalized.
uniform_noise: indicates if the noise should be generated using a
  uniform distribution or a Gaussian distribution.
)doc");

// --------------------------------------------------------------------------

REGISTER_OP("CropAndResize")
    .Input("image: T")
    .Input("boxes: float")
    .Input("box_ind: int32")
    .Input("crop_size: int32")
    .Output("crops: float")
    .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}")
    .Attr("method: {'bilinear'} = 'bilinear'")
    .Attr("extrapolation_value: float = 0")
    .SetShapeFn([](InferenceContext* c) {
      // Get inputs and validate ranks.
      const Shape* input;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
      const Shape* boxes;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &boxes));
      const Shape* box_ind;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &box_ind));

      // boxes[0] and box_ind[0] are both num_boxes.
      const Dimension* num_boxes_dim;
      TF_RETURN_IF_ERROR(
          c->Merge(c->Dim(boxes, 0), c->Dim(box_ind, 0), &num_boxes_dim));

      // boxes.dim(1) is 4.
      const Dimension* unused;
      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));

      return SetOutputToSizedImage(c, num_boxes_dim, 3 /* size_input_idx */,
                                   c->Dim(input, 3));
    })
    .Doc(R"doc(
Extracts crops from the input image tensor and bilinearly resizes them (possibly
with aspect ratio change) to a common output size specified by `crop_size`. This
is more general than the `crop_to_bounding_box` op, which extracts a fixed-size
slice from the input image and does not allow resizing or aspect ratio change.

Returns a tensor with `crops` from the input `image` at positions defined at the
bounding box locations in `boxes`. The cropped boxes are all resized (with
bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
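
For example, a minimal sketch via the Python wrapper (assuming it is exposed
as `tf.image.crop_and_resize`, with `image` a 4-D tensor, `boxes` of shape
`[num_boxes, 4]`, and `box_ind` of shape `[num_boxes]`):

    # Crop each box from its image and resize all crops to 28x28.
    crops = tf.image.crop_and_resize(image, boxes, box_ind, crop_size=[28, 28])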

image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
  Both `image_height` and `image_width` need to be positive.
boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
  specifies the coordinates of a box in the `box_ind[i]` image and is specified
  in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
  `y` is mapped to the image coordinate at `y * (image_height - 1)`, so the
  `[0, 1]` interval of normalized image height is mapped to
  `[0, image_height - 1]` in image height coordinates. We do allow y1 > y2, in
  which case the sampled crop is an up-down flipped version of the original
  image. The width dimension is treated similarly. Normalized coordinates
  outside the `[0, 1]` range are allowed, in which case we use
  `extrapolation_value` to extrapolate the input image values.
box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
  The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
  cropped image patches are resized to this size. The aspect ratio of the image
  content is not preserved. Both `crop_height` and `crop_width` need to be
  positive.
crops: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
method: A string specifying the interpolation method. Only 'bilinear' is
  supported for now.
extrapolation_value: Value used for extrapolation, when applicable.
)doc");

REGISTER_OP("CropAndResizeGradImage")
    .Input("grads: float")
    .Input("boxes: float")
    .Input("box_ind: int32")
    .Input("image_size: int32")
    .Output("output: T")
    .Attr("T: {float, half, double}")
    .Attr("method: {'bilinear'} = 'bilinear'")
    .Doc(R"doc(
Computes the gradient of the crop_and_resize op wrt the input image tensor.

grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
  specifies the coordinates of a box in the `box_ind[i]` image and is specified
  in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
  `y` is mapped to the image coordinate at `y * (image_height - 1)`, so the
  `[0, 1]` interval of normalized image height is mapped to
  `[0, image_height - 1]` in image height coordinates. We do allow y1 > y2, in
  which case the sampled crop is an up-down flipped version of the original
  image. The width dimension is treated similarly. Normalized coordinates
  outside the `[0, 1]` range are allowed, in which case we use
  `extrapolation_value` to extrapolate the input image values.
box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
  The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
  containing the original image size. Both `image_height` and `image_width` need
  to be positive.
output: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
method: A string specifying the interpolation method. Only 'bilinear' is
  supported for now.
)doc");

REGISTER_OP("CropAndResizeGradBoxes")
    .Input("grads: float")
    .Input("image: T")
    .Input("boxes: float")
    .Input("box_ind: int32")
    .Output("output: float")
    .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}")
    .Attr("method: {'bilinear'} = 'bilinear'")
    .Doc(R"doc(
Computes the gradient of the crop_and_resize op wrt the input boxes tensor.

grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
  Both `image_height` and `image_width` need to be positive.
boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
  specifies the coordinates of a box in the `box_ind[i]` image and is specified
  in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
  `y` is mapped to the image coordinate at `y * (image_height - 1)`, so the
  `[0, 1]` interval of normalized image height is mapped to
  `[0, image_height - 1]` in image height coordinates. We do allow y1 > y2, in
  which case the sampled crop is an up-down flipped version of the original
  image. The width dimension is treated similarly. Normalized coordinates
  outside the `[0, 1]` range are allowed, in which case we use
  `extrapolation_value` to extrapolate the input image values.
box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
  The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
output: A 2-D tensor of shape `[num_boxes, 4]`.
method: A string specifying the interpolation method. Only 'bilinear' is
  supported for now.
)doc");

// --------------------------------------------------------------------------

REGISTER_OP("NonMaxSuppression")
    .Input("boxes: float")
    .Input("scores: float")
    .Input("max_output_size: int32")
    .Output("selected_indices: int32")
    .Attr("iou_threshold: float = 0.5")
    .Doc(R"doc(
Greedily selects a subset of bounding boxes in descending order of score,
pruning away boxes that have high intersection-over-union (IOU) overlap
with previously selected boxes.  Bounding boxes are supplied as
[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
diagonal pair of box corners, and the coordinates can be provided as
normalized (i.e., lying in the interval [0, 1]) or absolute.  Note that this
algorithm is agnostic to where the origin is in the coordinate system and is
invariant to orthogonal transformations and translations of it; thus
translating or reflecting the coordinate system results in the same boxes
being selected by the algorithm.

The output of this operation is a set of integers indexing into the input
collection of bounding boxes representing the selected boxes.  The bounding
box coordinates corresponding to the selected indices can then be obtained
using the tf.gather operation.  For example:

  selected_indices = tf.image.non_max_suppression(
      boxes, scores, max_output_size, iou_threshold)
  selected_boxes = tf.gather(boxes, selected_indices)

boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
scores: A 1-D float tensor of shape `[num_boxes]` representing a single
  score corresponding to each box (each row of boxes).
max_output_size: A scalar integer tensor representing the maximum number of
  boxes to be selected by non max suppression.
iou_threshold: A float representing the threshold for deciding whether boxes
  overlap too much with respect to IOU.
selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
  indices from the boxes tensor, where `M <= max_output_size`.
)doc");

}  // namespace tensorflow