// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.vision.v1;

import "google/api/annotations.proto";
import "google/cloud/vision/v1/geometry.proto";
import "google/cloud/vision/v1/text_annotation.proto";
import "google/cloud/vision/v1/web_detection.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/vision/v1;vision";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1";


// Service that performs Google Cloud Vision API detection tasks over client
// images, such as face, landmark, logo, label, and text detection. The
// ImageAnnotator service returns detected entities from the images.
service ImageAnnotator {
  // Run image detection and annotation for a batch of images.
  rpc BatchAnnotateImages(BatchAnnotateImagesRequest) returns (BatchAnnotateImagesResponse) {
    option (google.api.http) = { post: "/v1/images:annotate" body: "*" };
  }
}
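
// An illustrative request body for the HTTP binding above
// (POST /v1/images:annotate). Field names follow the proto3 JSON mapping of
// the messages below; the bucket and object names are placeholders:
//
//   {
//     "requests": [
//       {
//         "image": {
//           "source": { "imageUri": "gs://example-bucket/example.jpg" }
//         },
//         "features": [
//           { "type": "LABEL_DETECTION", "maxResults": 10 }
//         ]
//       }
//     ]
//   }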

// Users describe the type of Google Cloud Vision API tasks to perform over
// images by using *Feature*s. Each Feature indicates a type of image
// detection task to perform. Features encode the Cloud Vision API
// vertical to operate on and the number of top-scoring results to return.
message Feature {
  // Type of image feature.
  enum Type {
    // Unspecified feature type.
    TYPE_UNSPECIFIED = 0;

    // Run face detection.
    FACE_DETECTION = 1;

    // Run landmark detection.
    LANDMARK_DETECTION = 2;

    // Run logo detection.
    LOGO_DETECTION = 3;

    // Run label detection.
    LABEL_DETECTION = 4;

    // Run OCR.
    TEXT_DETECTION = 5;

    // Run dense text document OCR. Takes precedence when both
    // DOCUMENT_TEXT_DETECTION and TEXT_DETECTION are present.
    DOCUMENT_TEXT_DETECTION = 11;

    // Run computer vision models to compute image safe-search properties.
    SAFE_SEARCH_DETECTION = 6;

    // Compute a set of image properties, such as the image's dominant colors.
    IMAGE_PROPERTIES = 7;

    // Run crop hints.
    CROP_HINTS = 9;

    // Run web detection.
    WEB_DETECTION = 10;
  }

  // The feature type.
  Type type = 1;

  // Maximum number of results of this type.
  int32 max_results = 2;
}
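
// An illustrative `Feature` in proto3 JSON form (values are placeholders);
// `max_results` caps how many results of that type are returned:
//
//   { "type": "FACE_DETECTION", "maxResults": 5 }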

// External image source (Google Cloud Storage image location).
message ImageSource {
  // NOTE: For new code, `image_uri` below is preferred.
  // Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  string gcs_image_uri = 1;

  // Image URI which supports:
  // 1) Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  // 2) Publicly accessible image HTTP/HTTPS URL.
  // This is preferred over the legacy `gcs_image_uri` above. When both
  // `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
  // precedence.
  string image_uri = 2;
}
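
// Illustrative `ImageSource` values in proto3 JSON form (placeholder URIs).
// If both fields were set, `image_uri` would take precedence per the
// comments above:
//
//   { "imageUri": "https://example.com/photo.png" }
//   { "gcsImageUri": "gs://example-bucket/photo.png" }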

// Client image to perform Google Cloud Vision API tasks over.
message Image {
  // Image content, represented as a stream of bytes.
  // Note: as with all `bytes` fields, protocol buffers use a pure binary
  // representation, whereas JSON representations use base64.
  bytes content = 1;

  // Google Cloud Storage image location. If both `content` and `source`
  // are provided for an image, `content` takes precedence and is
  // used to perform the image annotation request.
  ImageSource source = 2;
}
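
// Illustrative `Image` values in proto3 JSON form. As noted above, `content`
// is base64-encoded in JSON and takes precedence over `source` when both are
// set (the byte string below is a truncated placeholder):
//
//   { "content": "/9j/4AAQSkZJRgABAQ..." }
//   { "source": { "imageUri": "gs://example-bucket/photo.jpg" } }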

// A face annotation object contains the results of face detection.
message FaceAnnotation {
  // A face-specific landmark (for example, a face feature).
  // Landmark positions may fall outside the bounds of the image
  // if the face is near one or more edges of the image.
  // Therefore it is NOT guaranteed that `0 <= x < width` or
  // `0 <= y < height`.
  message Landmark {
    // Face landmark (feature) type.
    // Left and right are defined from the vantage of the viewer of the image
    // without considering mirror projections typical of photos. So `LEFT_EYE`
    // typically refers to the person's right eye.
    enum Type {
      // Unknown face landmark detected. Should not be filled.
      UNKNOWN_LANDMARK = 0;

      // Left eye.
      LEFT_EYE = 1;

      // Right eye.
      RIGHT_EYE = 2;

      // Left of left eyebrow.
      LEFT_OF_LEFT_EYEBROW = 3;

      // Right of left eyebrow.
      RIGHT_OF_LEFT_EYEBROW = 4;

      // Left of right eyebrow.
      LEFT_OF_RIGHT_EYEBROW = 5;

      // Right of right eyebrow.
      RIGHT_OF_RIGHT_EYEBROW = 6;

      // Midpoint between eyes.
      MIDPOINT_BETWEEN_EYES = 7;

      // Nose tip.
      NOSE_TIP = 8;

      // Upper lip.
      UPPER_LIP = 9;

      // Lower lip.
      LOWER_LIP = 10;

      // Mouth left.
      MOUTH_LEFT = 11;

      // Mouth right.
      MOUTH_RIGHT = 12;

      // Mouth center.
      MOUTH_CENTER = 13;

      // Nose, bottom right.
      NOSE_BOTTOM_RIGHT = 14;

      // Nose, bottom left.
      NOSE_BOTTOM_LEFT = 15;

      // Nose, bottom center.
      NOSE_BOTTOM_CENTER = 16;

      // Left eye, top boundary.
      LEFT_EYE_TOP_BOUNDARY = 17;

      // Left eye, right corner.
      LEFT_EYE_RIGHT_CORNER = 18;

      // Left eye, bottom boundary.
      LEFT_EYE_BOTTOM_BOUNDARY = 19;

      // Left eye, left corner.
      LEFT_EYE_LEFT_CORNER = 20;

      // Right eye, top boundary.
      RIGHT_EYE_TOP_BOUNDARY = 21;

      // Right eye, right corner.
      RIGHT_EYE_RIGHT_CORNER = 22;

      // Right eye, bottom boundary.
      RIGHT_EYE_BOTTOM_BOUNDARY = 23;

      // Right eye, left corner.
      RIGHT_EYE_LEFT_CORNER = 24;

      // Left eyebrow, upper midpoint.
      LEFT_EYEBROW_UPPER_MIDPOINT = 25;

      // Right eyebrow, upper midpoint.
      RIGHT_EYEBROW_UPPER_MIDPOINT = 26;

      // Left ear tragion.
      LEFT_EAR_TRAGION = 27;

      // Right ear tragion.
      RIGHT_EAR_TRAGION = 28;

      // Left eye pupil.
      LEFT_EYE_PUPIL = 29;

      // Right eye pupil.
      RIGHT_EYE_PUPIL = 30;

      // Forehead glabella.
      FOREHEAD_GLABELLA = 31;

      // Chin gnathion.
      CHIN_GNATHION = 32;

      // Chin left gonion.
      CHIN_LEFT_GONION = 33;

      // Chin right gonion.
      CHIN_RIGHT_GONION = 34;
    }

    // Face landmark type.
    Type type = 3;

    // Face landmark position.
    Position position = 4;
  }

  // The bounding polygon around the face. The coordinates of the bounding box
  // are in the original image's scale, as returned in `ImageParams`.
  // The bounding box is computed to "frame" the face in accordance with human
  // expectations. It is based on the landmarker results.
  // Note that one or more x and/or y coordinates may not be generated in the
  // `BoundingPoly` (the polygon will be unbounded) if only a partial face
  // appears in the image to be annotated.
  BoundingPoly bounding_poly = 1;

  // The `fd_bounding_poly` bounding polygon is tighter than the
  // `boundingPoly`, and encloses only the skin part of the face. Typically, it
  // is used to eliminate the face from any image analysis that detects the
  // "amount of skin" visible in an image. It is not based on the
  // landmarker results, only on the initial face detection, hence
  // the <code>fd</code> (face detection) prefix.
  BoundingPoly fd_bounding_poly = 2;

  // Detected face landmarks.
  repeated Landmark landmarks = 3;

  // Roll angle, which indicates the amount of clockwise/anti-clockwise rotation
  // of the face relative to the image vertical about the axis perpendicular to
  // the face. Range [-180,180].
  float roll_angle = 4;

  // Yaw angle, which indicates the leftward/rightward angle that the face is
  // pointing relative to the vertical plane perpendicular to the image. Range
  // [-180,180].
  float pan_angle = 5;

  // Pitch angle, which indicates the upwards/downwards angle that the face is
  // pointing relative to the image's horizontal plane. Range [-180,180].
  float tilt_angle = 6;

  // Detection confidence. Range [0, 1].
  float detection_confidence = 7;

  // Face landmarking confidence. Range [0, 1].
  float landmarking_confidence = 8;

  // Joy likelihood.
  Likelihood joy_likelihood = 9;

  // Sorrow likelihood.
  Likelihood sorrow_likelihood = 10;

  // Anger likelihood.
  Likelihood anger_likelihood = 11;

  // Surprise likelihood.
  Likelihood surprise_likelihood = 12;

  // Under-exposed likelihood.
  Likelihood under_exposed_likelihood = 13;

  // Blurred likelihood.
  Likelihood blurred_likelihood = 14;

  // Headwear likelihood.
  Likelihood headwear_likelihood = 15;
}

// Detected entity location information.
message LocationInfo {
  // lat/long location coordinates.
  google.type.LatLng lat_lng = 1;
}

// A `Property` consists of a user-supplied name/value pair.
message Property {
  // Name of the property.
  string name = 1;

  // Value of the property.
  string value = 2;
}

// Set of detected entity features.
message EntityAnnotation {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search API](https://developers.google.com/knowledge-graph/).
  string mid = 1;

  // The language code for the locale in which the entity textual
  // `description` is expressed.
  string locale = 2;

  // Entity textual description, expressed in its `locale` language.
  string description = 3;

  // Overall score of the result. Range [0, 1].
  float score = 4;

  // The accuracy of the entity detection in an image.
  // For example, for an image in which the "Eiffel Tower" entity is detected,
  // this field represents the confidence that there is a tower in the query
  // image. Range [0, 1].
  float confidence = 5;

  // The relevancy of the ICA (Image Content Annotation) label to the
  // image. For example, the relevancy of "tower" is likely higher to an image
  // containing the detected "Eiffel Tower" than to an image containing a
  // detected distant towering building, even though the confidence that
  // there is a tower in each image may be the same. Range [0, 1].
  float topicality = 6;

  // Image region to which this entity belongs. Currently not produced
  // for `LABEL_DETECTION` features. For `TEXT_DETECTION` (OCR), `boundingPoly`s
  // are produced for the entire text detected in an image region, followed by
  // `boundingPoly`s for each word within the detected text.
  BoundingPoly bounding_poly = 7;

  // The location information for the detected entity. Multiple
  // `LocationInfo` elements can be present because one location may
  // indicate the location of the scene in the image, and another location
  // may indicate the location of the place where the image was taken.
  // Location information is usually present for landmarks.
  repeated LocationInfo locations = 8;

  // Some entities may have optional user-supplied `Property` (name/value)
  // fields, such as a score or string that qualifies the entity.
  repeated Property properties = 9;
}

// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
  // Represents the adult content likelihood for the image.
  Likelihood adult = 1;

  // Spoof likelihood. The likelihood that a modification
  // was made to the image's canonical version to make it appear
  // funny or offensive.
  Likelihood spoof = 2;

  // Likelihood that this is a medical image.
  Likelihood medical = 3;

  // Violence likelihood.
  Likelihood violence = 4;
}

// Rectangle determined by min and max `LatLng` pairs.
message LatLongRect {
  // Min lat/long pair.
  google.type.LatLng min_lat_lng = 1;

  // Max lat/long pair.
  google.type.LatLng max_lat_lng = 2;
}

// Color information consists of RGB channels, score, and the fraction of
// the image that the color occupies in the image.
message ColorInfo {
  // RGB components of the color.
  google.type.Color color = 1;

  // Image-specific score for this color. Value in range [0, 1].
  float score = 2;

  // The fraction of pixels the color occupies in the image.
  // Value in range [0, 1].
  float pixel_fraction = 3;
}
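
// Illustrative `ColorInfo` element as it might appear in a response
// (proto3 JSON form; the numbers are placeholders). `google.type.Color`
// channels are floats in the range [0, 1]:
//
//   { "color": { "red": 0.29, "green": 0.55, "blue": 0.83 },
//     "score": 0.42, "pixelFraction": 0.18 }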

// Set of dominant colors and their corresponding scores.
message DominantColorsAnnotation {
  // RGB color values with their score and pixel fraction.
  repeated ColorInfo colors = 1;
}

// Stores image properties, such as dominant colors.
message ImageProperties {
  // If present, dominant color detection has completed successfully.
  DominantColorsAnnotation dominant_colors = 1;
}

// Single crop hint that is used to generate a new crop when serving an image.
message CropHint {
  // The bounding polygon for the crop region. The coordinates of the bounding
  // box are in the original image's scale, as returned in `ImageParams`.
  BoundingPoly bounding_poly = 1;

  // Confidence of this being a salient region.  Range [0, 1].
  float confidence = 2;

  // Fraction of importance of this salient region with respect to the original
  // image.
  float importance_fraction = 3;
}

// Set of crop hints that are used to generate new crops when serving images.
message CropHintsAnnotation {
  repeated CropHint crop_hints = 1;
}

// Parameters for crop hints annotation request.
message CropHintsParams {
  // Aspect ratios in floats, representing the ratio of the width to the height
  // of the image. For example, if the desired aspect ratio is 4/3, the
  // corresponding float value should be 1.33333.  If not specified, the
  // best possible crop is returned. The number of provided aspect ratios is
  // limited to a maximum of 16; any aspect ratios provided after the 16th are
  // ignored.
  repeated float aspect_ratios = 1;
}
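
// Illustrative `CropHintsParams` in proto3 JSON form, asking for 4:3 and
// 16:9 crop hints (4/3 ≈ 1.33333, 16/9 ≈ 1.77778):
//
//   { "aspectRatios": [1.33333, 1.77778] }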

// Image context and/or feature-specific parameters.
message ImageContext {
  // lat/long rectangle that specifies the location of the image.
  LatLongRect lat_long_rect = 1;

  // List of languages to use for TEXT_DETECTION. In most cases, an empty value
  // yields the best results since it enables automatic language detection. For
  // languages based on the Latin alphabet, setting `language_hints` is not
  // needed. In rare cases, when the language of the text in the image is known,
  // setting a hint will help get better results (although it will be a
  // significant hindrance if the hint is wrong). Text detection returns an
  // error if one or more of the specified languages is not one of the
  // [supported languages](/vision/docs/languages).
  repeated string language_hints = 2;

  // Parameters for crop hints annotation request.
  CropHintsParams crop_hints_params = 4;
}
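
// Illustrative `ImageContext` in proto3 JSON form (all values are
// placeholders), combining a location rectangle, an OCR language hint, and
// crop-hint parameters:
//
//   {
//     "latLongRect": {
//       "minLatLng": { "latitude": 37.42, "longitude": -122.09 },
//       "maxLatLng": { "latitude": 37.43, "longitude": -122.08 }
//     },
//     "languageHints": ["en"],
//     "cropHintsParams": { "aspectRatios": [1.0] }
//   }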

// Request for performing Google Cloud Vision API tasks over a user-provided
// image, with user-requested features.
message AnnotateImageRequest {
  // The image to be processed.
  Image image = 1;

  // Requested features.
  repeated Feature features = 2;

  // Additional context that may accompany the image.
  ImageContext image_context = 3;
}

// Response to an image annotation request.
message AnnotateImageResponse {
  // If present, face detection has completed successfully.
  repeated FaceAnnotation face_annotations = 1;

  // If present, landmark detection has completed successfully.
  repeated EntityAnnotation landmark_annotations = 2;

  // If present, logo detection has completed successfully.
  repeated EntityAnnotation logo_annotations = 3;

  // If present, label detection has completed successfully.
  repeated EntityAnnotation label_annotations = 4;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully.
  repeated EntityAnnotation text_annotations = 5;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully.
  // This annotation provides the structural hierarchy for the OCR detected
  // text.
  TextAnnotation full_text_annotation = 12;

  // If present, safe-search annotation has completed successfully.
  SafeSearchAnnotation safe_search_annotation = 6;

  // If present, image properties were extracted successfully.
  ImageProperties image_properties_annotation = 8;

  // If present, crop hints have completed successfully.
  CropHintsAnnotation crop_hints_annotation = 11;

  // If present, web detection has completed successfully.
  WebDetection web_detection = 13;

  // If set, represents the error message for the operation.
  // Note that filled-in image annotations are guaranteed to be
  // correct, even when `error` is set.
  google.rpc.Status error = 9;
}
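
// Illustrative fragment of a response to a LABEL_DETECTION request
// (proto3 JSON form; the mid, description, and scores are placeholders):
//
//   {
//     "labelAnnotations": [
//       { "mid": "/m/placeholder", "description": "tower",
//         "score": 0.92, "topicality": 0.92 }
//     ]
//   }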

// Multiple image annotation requests are batched into a single service call.
message BatchAnnotateImagesRequest {
  // Individual image annotation requests for this batch.
  repeated AnnotateImageRequest requests = 1;
}
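
// Batching sketch (illustrative, placeholder URIs): each element of
// `requests` carries its own image and feature list, so a single call can
// annotate several images with different features:
//
//   {
//     "requests": [
//       { "image": { "source": { "imageUri": "gs://example-bucket/a.jpg" } },
//         "features": [ { "type": "LABEL_DETECTION" } ] },
//       { "image": { "source": { "imageUri": "gs://example-bucket/b.jpg" } },
//         "features": [ { "type": "SAFE_SEARCH_DETECTION" } ] }
//     ]
//   }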

// Response to a batch image annotation request.
message BatchAnnotateImagesResponse {
  // Individual responses to image annotation requests within the batch.
  repeated AnnotateImageResponse responses = 1;
}

// A bucketized representation of likelihood, which is intended to give clients
// highly stable results across model upgrades.
enum Likelihood {
  // Unknown likelihood.
  UNKNOWN = 0;

  // It is very unlikely that the image belongs to the specified vertical.
  VERY_UNLIKELY = 1;

  // It is unlikely that the image belongs to the specified vertical.
  UNLIKELY = 2;

  // It is possible that the image belongs to the specified vertical.
  POSSIBLE = 3;

  // It is likely that the image belongs to the specified vertical.
  LIKELY = 4;

  // It is very likely that the image belongs to the specified vertical.
  VERY_LIKELY = 5;
}