Diffstat (limited to 'tensorflow/core/kernels')
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op.cc121
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op.h64
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc43
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc22
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op_test.cc88
-rw-r--r--tensorflow/core/kernels/aggregate_ops.cc238
-rw-r--r--tensorflow/core/kernels/aggregate_ops.h211
-rw-r--r--tensorflow/core/kernels/aggregate_ops_gpu.cu.cc141
-rw-r--r--tensorflow/core/kernels/argmax_op.cc163
-rw-r--r--tensorflow/core/kernels/argmax_op.h55
-rw-r--r--tensorflow/core/kernels/argmax_op_gpu.cu.cc20
-rw-r--r--tensorflow/core/kernels/assign_op.h92
-rw-r--r--tensorflow/core/kernels/attention_ops.cc92
-rw-r--r--tensorflow/core/kernels/avgpooling_op.cc418
-rw-r--r--tensorflow/core/kernels/avgpooling_op.h58
-rw-r--r--tensorflow/core/kernels/avgpooling_op_gpu.cu.cc101
-rw-r--r--tensorflow/core/kernels/batch_matmul_op.cc260
-rw-r--r--tensorflow/core/kernels/batch_norm_op.cc223
-rw-r--r--tensorflow/core/kernels/batch_norm_op.h133
-rw-r--r--tensorflow/core/kernels/batch_norm_op_gpu.cu.cc17
-rw-r--r--tensorflow/core/kernels/bcast_ops.cc71
-rw-r--r--tensorflow/core/kernels/bias_op.cc112
-rw-r--r--tensorflow/core/kernels/bias_op.h41
-rw-r--r--tensorflow/core/kernels/bias_op_gpu.cu.cc23
-rw-r--r--tensorflow/core/kernels/candidate_sampler_ops.cc243
-rw-r--r--tensorflow/core/kernels/cast_op.cc233
-rw-r--r--tensorflow/core/kernels/cast_op.h71
-rw-r--r--tensorflow/core/kernels/cast_op_gpu.cu.cc45
-rw-r--r--tensorflow/core/kernels/cast_op_test.cc100
-rw-r--r--tensorflow/core/kernels/check_numerics_op.cc190
-rw-r--r--tensorflow/core/kernels/check_numerics_op_gpu.cu.cc62
-rw-r--r--tensorflow/core/kernels/cholesky_op.cc71
-rw-r--r--tensorflow/core/kernels/concat_op.cc153
-rw-r--r--tensorflow/core/kernels/concat_op.h27
-rw-r--r--tensorflow/core/kernels/concat_op_cpu.cc122
-rw-r--r--tensorflow/core/kernels/concat_op_gpu.cu.cc41
-rw-r--r--tensorflow/core/kernels/concat_op_test.cc240
-rw-r--r--tensorflow/core/kernels/constant_op.cc249
-rw-r--r--tensorflow/core/kernels/constant_op.h25
-rw-r--r--tensorflow/core/kernels/constant_op_gpu.cu.cc89
-rw-r--r--tensorflow/core/kernels/constant_op_test.cc43
-rw-r--r--tensorflow/core/kernels/control_flow_ops.cc359
-rw-r--r--tensorflow/core/kernels/control_flow_ops.h22
-rw-r--r--tensorflow/core/kernels/control_flow_ops_test.cc71
-rw-r--r--tensorflow/core/kernels/conv_2d.h127
-rw-r--r--tensorflow/core/kernels/conv_grad_ops.cc1190
-rw-r--r--tensorflow/core/kernels/conv_ops.cc373
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu.cu.cc35
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu_2.cu.cc16
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu_3.cu.cc22
-rw-r--r--tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc16
-rw-r--r--tensorflow/core/kernels/core_ops_test.cc990
-rw-r--r--tensorflow/core/kernels/count_up_to_op.cc51
-rw-r--r--tensorflow/core/kernels/cwise_op_abs.cc23
-rw-r--r--tensorflow/core/kernels/cwise_op_add.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_ceil.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_complex.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_conj.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_cos.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_div.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_equal_to.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_exp.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_floor.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_add.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_div.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_less.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_log.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc13
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc13
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_real.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_select.cu.cc15
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_square.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_greater.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_greater_equal.cc22
-rw-r--r--tensorflow/core/kernels/cwise_op_imag.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_inverse.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_isfinite.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_isinf.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_isnan.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_less.cc20
-rw-r--r--tensorflow/core/kernels/cwise_op_less_equal.cc22
-rw-r--r--tensorflow/core/kernels/cwise_op_log.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_logical_and.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_logical_not.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_logical_or.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_maximum.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_minimum.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_mod.cc6
-rw-r--r--tensorflow/core/kernels/cwise_op_mul.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_neg.cc9
-rw-r--r--tensorflow/core/kernels/cwise_op_not_equal_to.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_pow.cc9
-rw-r--r--tensorflow/core/kernels/cwise_op_real.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_rsqrt.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_select.cc17
-rw-r--r--tensorflow/core/kernels/cwise_op_sigmoid.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_sign.cc19
-rw-r--r--tensorflow/core/kernels/cwise_op_sin.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_sqrt.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_square.cc9
-rw-r--r--tensorflow/core/kernels/cwise_op_sub.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_tanh.cc8
-rw-r--r--tensorflow/core/kernels/cwise_ops.h607
-rw-r--r--tensorflow/core/kernels/cwise_ops_common.cc42
-rw-r--r--tensorflow/core/kernels/cwise_ops_common.h390
-rw-r--r--tensorflow/core/kernels/cwise_ops_gpu_common.cu.h135
-rw-r--r--tensorflow/core/kernels/cwise_ops_test.cc167
-rw-r--r--tensorflow/core/kernels/decode_csv_op.cc222
-rw-r--r--tensorflow/core/kernels/decode_jpeg_op.cc72
-rw-r--r--tensorflow/core/kernels/decode_png_op.cc69
-rw-r--r--tensorflow/core/kernels/decode_raw_op.cc90
-rw-r--r--tensorflow/core/kernels/dense_update_ops.cc136
-rw-r--r--tensorflow/core/kernels/dense_update_ops.h43
-rw-r--r--tensorflow/core/kernels/dense_update_ops_gpu.cu.cc22
-rw-r--r--tensorflow/core/kernels/determinant_op.cc66
-rw-r--r--tensorflow/core/kernels/diag_op.cc93
-rw-r--r--tensorflow/core/kernels/dynamic_partition_op.cc154
-rw-r--r--tensorflow/core/kernels/dynamic_partition_op_test.cc145
-rw-r--r--tensorflow/core/kernels/dynamic_stitch_op.cc158
-rw-r--r--tensorflow/core/kernels/dynamic_stitch_op_test.cc133
-rw-r--r--tensorflow/core/kernels/edit_distance_op.cc217
-rw-r--r--tensorflow/core/kernels/encode_jpeg_op.cc114
-rw-r--r--tensorflow/core/kernels/encode_png_op.cc52
-rw-r--r--tensorflow/core/kernels/example_parsing_ops.cc444
-rw-r--r--tensorflow/core/kernels/fact_op.cc96
-rw-r--r--tensorflow/core/kernels/fifo_queue.cc518
-rw-r--r--tensorflow/core/kernels/fifo_queue.h127
-rw-r--r--tensorflow/core/kernels/fifo_queue_op.cc93
-rw-r--r--tensorflow/core/kernels/fill_functor.h26
-rw-r--r--tensorflow/core/kernels/fixed_length_record_reader_op.cc109
-rw-r--r--tensorflow/core/kernels/gather_op.cc136
-rw-r--r--tensorflow/core/kernels/gather_op_test.cc213
-rw-r--r--tensorflow/core/kernels/identity_op.cc45
-rw-r--r--tensorflow/core/kernels/identity_op.h25
-rw-r--r--tensorflow/core/kernels/identity_op_test.cc56
-rw-r--r--tensorflow/core/kernels/identity_reader_op.cc57
-rw-r--r--tensorflow/core/kernels/in_topk_op.cc58
-rw-r--r--tensorflow/core/kernels/initializable_lookup_table.cc41
-rw-r--r--tensorflow/core/kernels/initializable_lookup_table.h103
-rw-r--r--tensorflow/core/kernels/io.cc270
-rw-r--r--tensorflow/core/kernels/io.h38
-rw-r--r--tensorflow/core/kernels/l2loss_op.cc69
-rw-r--r--tensorflow/core/kernels/l2loss_op.h24
-rw-r--r--tensorflow/core/kernels/l2loss_op_gpu.cu.cc16
-rw-r--r--tensorflow/core/kernels/linalg_ops_common.cc99
-rw-r--r--tensorflow/core/kernels/linalg_ops_common.h123
-rw-r--r--tensorflow/core/kernels/listdiff_op.cc75
-rw-r--r--tensorflow/core/kernels/logging_ops.cc77
-rw-r--r--tensorflow/core/kernels/logging_ops_test.cc87
-rw-r--r--tensorflow/core/kernels/lookup_table_init_op.cc116
-rw-r--r--tensorflow/core/kernels/lookup_table_op.cc166
-rw-r--r--tensorflow/core/kernels/lookup_table_op.h80
-rw-r--r--tensorflow/core/kernels/lookup_util.cc72
-rw-r--r--tensorflow/core/kernels/lookup_util.h31
-rw-r--r--tensorflow/core/kernels/lrn_op.cc228
-rw-r--r--tensorflow/core/kernels/lrn_op_test.cc185
-rw-r--r--tensorflow/core/kernels/matching_files_op.cc42
-rw-r--r--tensorflow/core/kernels/matmul_op.cc214
-rw-r--r--tensorflow/core/kernels/matmul_op.h40
-rw-r--r--tensorflow/core/kernels/matmul_op_gpu.cu.cc32
-rw-r--r--tensorflow/core/kernels/matmul_op_test.cc56
-rw-r--r--tensorflow/core/kernels/matrix_inverse_op.cc64
-rw-r--r--tensorflow/core/kernels/maxpooling_op.cc554
-rw-r--r--tensorflow/core/kernels/maxpooling_op.h29
-rw-r--r--tensorflow/core/kernels/maxpooling_op_gpu.cu.cc261
-rw-r--r--tensorflow/core/kernels/maxpooling_op_gpu.h42
-rw-r--r--tensorflow/core/kernels/no_op.cc8
-rw-r--r--tensorflow/core/kernels/no_op.h17
-rw-r--r--tensorflow/core/kernels/ops_testutil.cc18
-rw-r--r--tensorflow/core/kernels/ops_testutil.h191
-rw-r--r--tensorflow/core/kernels/ops_util.cc113
-rw-r--r--tensorflow/core/kernels/ops_util.h180
-rw-r--r--tensorflow/core/kernels/ops_util_test.cc265
-rw-r--r--tensorflow/core/kernels/pack_op.cc114
-rw-r--r--tensorflow/core/kernels/pad_op.cc159
-rw-r--r--tensorflow/core/kernels/pad_op.h27
-rw-r--r--tensorflow/core/kernels/pad_op_gpu.cu.cc26
-rw-r--r--tensorflow/core/kernels/pooling_ops_common.cc252
-rw-r--r--tensorflow/core/kernels/pooling_ops_common.h264
-rw-r--r--tensorflow/core/kernels/pooling_ops_common_gpu.h39
-rw-r--r--tensorflow/core/kernels/queue_base.cc153
-rw-r--r--tensorflow/core/kernels/queue_base.h77
-rw-r--r--tensorflow/core/kernels/queue_ops.cc288
-rw-r--r--tensorflow/core/kernels/random_crop_op.cc103
-rw-r--r--tensorflow/core/kernels/random_crop_op_test.cc60
-rw-r--r--tensorflow/core/kernels/random_op.cc276
-rw-r--r--tensorflow/core/kernels/random_op.h16
-rw-r--r--tensorflow/core/kernels/random_op_gpu.cu.cc152
-rw-r--r--tensorflow/core/kernels/random_op_test.cc99
-rw-r--r--tensorflow/core/kernels/random_shuffle_op.cc89
-rw-r--r--tensorflow/core/kernels/random_shuffle_queue_op.cc740
-rw-r--r--tensorflow/core/kernels/range_sampler.cc305
-rw-r--r--tensorflow/core/kernels/range_sampler.h237
-rw-r--r--tensorflow/core/kernels/range_sampler_test.cc320
-rw-r--r--tensorflow/core/kernels/reader_base.cc156
-rw-r--r--tensorflow/core/kernels/reader_base.h107
-rw-r--r--tensorflow/core/kernels/reader_base.proto13
-rw-r--r--tensorflow/core/kernels/reader_ops.cc132
-rw-r--r--tensorflow/core/kernels/reduction_ops.h66
-rw-r--r--tensorflow/core/kernels/reduction_ops_all.cc17
-rw-r--r--tensorflow/core/kernels/reduction_ops_any.cc17
-rw-r--r--tensorflow/core/kernels/reduction_ops_common.h302
-rw-r--r--tensorflow/core/kernels/reduction_ops_gpu.cu.cc65
-rw-r--r--tensorflow/core/kernels/reduction_ops_max.cc26
-rw-r--r--tensorflow/core/kernels/reduction_ops_mean.cc12
-rw-r--r--tensorflow/core/kernels/reduction_ops_min.cc26
-rw-r--r--tensorflow/core/kernels/reduction_ops_prod.cc26
-rw-r--r--tensorflow/core/kernels/reduction_ops_sum.cc37
-rw-r--r--tensorflow/core/kernels/reduction_ops_test.cc73
-rw-r--r--tensorflow/core/kernels/reference_gemm.h75
-rw-r--r--tensorflow/core/kernels/relu_op.cc154
-rw-r--r--tensorflow/core/kernels/relu_op.h79
-rw-r--r--tensorflow/core/kernels/relu_op_gpu.cu.cc27
-rw-r--r--tensorflow/core/kernels/reshape_op.cc29
-rw-r--r--tensorflow/core/kernels/reshape_op.h83
-rw-r--r--tensorflow/core/kernels/resize_area_op.cc139
-rw-r--r--tensorflow/core/kernels/resize_bicubic_op.cc121
-rw-r--r--tensorflow/core/kernels/resize_bilinear_op.cc109
-rw-r--r--tensorflow/core/kernels/resize_bilinear_op_test.cc171
-rw-r--r--tensorflow/core/kernels/resize_nearest_neighbor_op.cc89
-rw-r--r--tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc163
-rw-r--r--tensorflow/core/kernels/restore_op.cc65
-rw-r--r--tensorflow/core/kernels/restore_op_test.cc305
-rw-r--r--tensorflow/core/kernels/reverse_op.cc139
-rw-r--r--tensorflow/core/kernels/reverse_op.h28
-rw-r--r--tensorflow/core/kernels/reverse_op_gpu.cu.cc33
-rw-r--r--tensorflow/core/kernels/reverse_op_test.cc101
-rw-r--r--tensorflow/core/kernels/reverse_sequence_op.cc170
-rw-r--r--tensorflow/core/kernels/reverse_sequence_op.h56
-rw-r--r--tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc26
-rw-r--r--tensorflow/core/kernels/save_op.cc81
-rw-r--r--tensorflow/core/kernels/save_op_test.cc443
-rw-r--r--tensorflow/core/kernels/scatter_op.cc167
-rw-r--r--tensorflow/core/kernels/scatter_op_test.cc255
-rw-r--r--tensorflow/core/kernels/segment_reduction_ops.cc466
-rw-r--r--tensorflow/core/kernels/segment_reduction_ops_test.cc157
-rw-r--r--tensorflow/core/kernels/sendrecv_ops.cc116
-rw-r--r--tensorflow/core/kernels/sendrecv_ops.h32
-rw-r--r--tensorflow/core/kernels/sequence_ops.cc123
-rw-r--r--tensorflow/core/kernels/shape_ops.cc261
-rw-r--r--tensorflow/core/kernels/slice_op.cc242
-rw-r--r--tensorflow/core/kernels/slice_op.h25
-rw-r--r--tensorflow/core/kernels/slice_op_gpu.cu.cc31
-rw-r--r--tensorflow/core/kernels/slice_op_test.cc73
-rw-r--r--tensorflow/core/kernels/softmax_op.cc62
-rw-r--r--tensorflow/core/kernels/softmax_op.h70
-rw-r--r--tensorflow/core/kernels/softmax_op_gpu.cu.cc31
-rw-r--r--tensorflow/core/kernels/softplus_op.cc97
-rw-r--r--tensorflow/core/kernels/softplus_op.h46
-rw-r--r--tensorflow/core/kernels/softplus_op_gpu.cu.cc25
-rw-r--r--tensorflow/core/kernels/sparse_concat_op.cc139
-rw-r--r--tensorflow/core/kernels/sparse_matmul_op.cc192
-rw-r--r--tensorflow/core/kernels/sparse_matmul_op_test.cc139
-rw-r--r--tensorflow/core/kernels/sparse_reorder_op.cc71
-rw-r--r--tensorflow/core/kernels/sparse_to_dense_op.cc129
-rw-r--r--tensorflow/core/kernels/sparse_to_dense_op_test.cc283
-rw-r--r--tensorflow/core/kernels/split_op.cc146
-rw-r--r--tensorflow/core/kernels/split_op.h31
-rw-r--r--tensorflow/core/kernels/split_op_cpu.cc30
-rw-r--r--tensorflow/core/kernels/split_op_gpu.cu.cc31
-rw-r--r--tensorflow/core/kernels/string_to_hash_bucket_op.cc47
-rw-r--r--tensorflow/core/kernels/string_to_number_op.cc71
-rw-r--r--tensorflow/core/kernels/summary_image_op.cc169
-rw-r--r--tensorflow/core/kernels/summary_image_op_test.cc141
-rw-r--r--tensorflow/core/kernels/summary_op.cc141
-rw-r--r--tensorflow/core/kernels/summary_op_test.cc282
-rw-r--r--tensorflow/core/kernels/text_line_reader_op.cc99
-rw-r--r--tensorflow/core/kernels/tf_record_reader_op.cc76
-rw-r--r--tensorflow/core/kernels/tile_ops.cc460
-rw-r--r--tensorflow/core/kernels/tile_ops.h48
-rw-r--r--tensorflow/core/kernels/tile_ops_gpu.cu.cc38
-rw-r--r--tensorflow/core/kernels/topk_op.cc71
-rw-r--r--tensorflow/core/kernels/training_ops.cc884
-rw-r--r--tensorflow/core/kernels/training_ops.h65
-rw-r--r--tensorflow/core/kernels/training_ops_gpu.cu.cc127
-rw-r--r--tensorflow/core/kernels/training_ops_test.cc226
-rw-r--r--tensorflow/core/kernels/transpose_op.cc190
-rw-r--r--tensorflow/core/kernels/transpose_op.h19
-rw-r--r--tensorflow/core/kernels/transpose_op_functor.h28
-rw-r--r--tensorflow/core/kernels/transpose_op_gpu.cu.cc43
-rw-r--r--tensorflow/core/kernels/unique_op.cc61
-rw-r--r--tensorflow/core/kernels/unique_op_test.cc51
-rw-r--r--tensorflow/core/kernels/unpack_op.cc96
-rw-r--r--tensorflow/core/kernels/variable_ops.cc37
-rw-r--r--tensorflow/core/kernels/variable_ops.h146
-rw-r--r--tensorflow/core/kernels/where_op.cc74
-rw-r--r--tensorflow/core/kernels/where_op.h65
-rw-r--r--tensorflow/core/kernels/whole_file_read_ops.cc108
-rw-r--r--tensorflow/core/kernels/xent_op.cc90
-rw-r--r--tensorflow/core/kernels/xent_op.h102
-rw-r--r--tensorflow/core/kernels/xent_op_gpu.cu.cc35
-rw-r--r--tensorflow/core/kernels/xent_op_test.cc46
323 files changed, 33366 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
new file mode 100644
index 0000000000..7cc0534354
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -0,0 +1,121 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/adjust_contrast_op.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class AdjustContrastOp : public OpKernel {
+ public:
+ explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& factor = context->input(1);
+ const Tensor& min_value = context->input(2);
+ const Tensor& max_value = context->input(3);
+ OP_REQUIRES(context, input.dims() >= 3,
+        errors::InvalidArgument("input must be at least 3-D, got shape ",
+ input.shape().ShortDebugString()));
+ const int64 height = input.dim_size(input.dims() - 3);
+ const int64 width = input.dim_size(input.dims() - 2);
+ const int64 channels = input.dim_size(input.dims() - 1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(factor.shape()),
+ errors::InvalidArgument("contrast_factor must be scalar: ",
+ factor.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(min_value.shape()),
+ errors::InvalidArgument("min_value must be scalar: ",
+ min_value.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(max_value.shape()),
+ errors::InvalidArgument("max_value must be scalar: ",
+ max_value.shape().ShortDebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+ Tensor mean_values;
+ OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::value,
+ TensorShape(input.shape()),
+ &mean_values));
+
+ if (input.NumElements() > 0) {
+ const int64 batch = input.NumElements() / (height * width * channels);
+ const int64 shape[4] = {batch, height, width, channels};
+ functor::AdjustContrast<Device, T>()(
+ context->eigen_device<Device>(), input.shaped<T, 4>(shape),
+ factor.scalar<float>(), min_value.scalar<float>(),
+ max_value.scalar<float>(), mean_values.shaped<float, 4>(shape),
+ output->shaped<float, 4>(shape));
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AdjustContrast").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ AdjustContrastOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int16);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the function specializations for GPU (to prevent
+// building the GPU versions here, they will be built compiling _gpu.cu.cc).
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void AdjustContrast<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<float>::ConstScalar contrast_factor, \
+ typename TTypes<float>::ConstScalar min_value, \
+ typename TTypes<float>::ConstScalar max_value, \
+ typename TTypes<float, 4>::Tensor mean_values, \
+ typename TTypes<float, 4>::Tensor output); \
+ extern template struct AdjustContrast<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(uint8);
+DECLARE_GPU_SPEC(int8);
+DECLARE_GPU_SPEC(int16);
+DECLARE_GPU_SPEC(int32);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AdjustContrast").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ AdjustContrastOp<GPUDevice, T>);
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(int16);
+REGISTER_GPU_KERNEL(int32);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
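
AdjustContrastOp::Compute above accepts any input of rank >= 3 and views it as 4-D by folding all leading dimensions into a single batch dimension (batch = NumElements / (height * width * channels)), so for example a [2, 5, 299, 299, 3] input is processed as if it were [10, 299, 299, 3]. The helper below is a minimal illustration of that folding on a plain shape vector; it is not part of the patch and the name is made up.

// Illustrative only: folds every leading dimension of an NHWC-like shape
// into one batch dimension, mirroring the input.shaped<T, 4>(shape) call above.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

std::vector<int64_t> FoldToBatchHWC(const std::vector<int64_t>& dims) {
  // Assumes dims.size() >= 3; the trailing dims are height, width, channels.
  const int64_t channels = dims[dims.size() - 1];
  const int64_t width = dims[dims.size() - 2];
  const int64_t height = dims[dims.size() - 3];
  const int64_t total = std::accumulate(dims.begin(), dims.end(), int64_t{1},
                                        std::multiplies<int64_t>());
  return {total / (height * width * channels), height, width, channels};
}
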
diff --git a/tensorflow/core/kernels/adjust_contrast_op.h b/tensorflow/core/kernels/adjust_contrast_op.h
new file mode 100644
index 0000000000..2182b33c03
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op.h
@@ -0,0 +1,64 @@
+#ifndef TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
+#define TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by AdjustContrastOp to do the computations.
+template <typename Device, typename T>
+struct AdjustContrast {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<float>::ConstScalar contrast_factor,
+ typename TTypes<float>::ConstScalar min_value,
+ typename TTypes<float>::ConstScalar max_value,
+ typename TTypes<float, 4>::Tensor mean_values,
+ typename TTypes<float, 4>::Tensor output) {
+ const int batch = input.dimension(0);
+ const int height = input.dimension(1);
+ const int width = input.dimension(2);
+ const int channels = input.dimension(3);
+
+ Eigen::array<int, 4> scalar_broadcast{{batch, height, width, channels}};
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::array<int, 2> reduction_axis{{1, 2}};
+ Eigen::array<int, 4> scalar{{1, 1, 1, 1}};
+ Eigen::array<int, 4> broadcast_dims{{1, height, width, 1}};
+ Eigen::Tensor<int, 4>::Dimensions reshape_dims{{batch, 1, 1, channels}};
+#else
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >
+ reduction_axis;
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<1>,
+ Eigen::type2index<1>, Eigen::type2index<1> > scalar;
+ Eigen::IndexList<Eigen::type2index<1>, int, int, Eigen::type2index<1> >
+ broadcast_dims;
+ broadcast_dims.set(1, height);
+ broadcast_dims.set(2, width);
+ Eigen::IndexList<int, Eigen::type2index<1>, Eigen::type2index<1>, int>
+ reshape_dims;
+ reshape_dims.set(0, batch);
+ reshape_dims.set(3, channels);
+#endif
+ mean_values.device(d) = input.template cast<float>()
+ .mean(reduction_axis)
+ .eval()
+ .reshape(reshape_dims)
+ .broadcast(broadcast_dims);
+
+ auto contrast_factor_tensor =
+ contrast_factor.reshape(scalar).broadcast(scalar_broadcast);
+ auto adjusted =
+ (input.template cast<float>() - mean_values) * contrast_factor_tensor +
+ mean_values;
+ auto min_bcast = min_value.reshape(scalar).broadcast(scalar_broadcast);
+ auto max_bcast = max_value.reshape(scalar).broadcast(scalar_broadcast);
+ // TODO(wicke): This is rather slow and should be re-written as pure cuda.
+ output.device(d) = adjusted.cwiseMin(max_bcast).cwiseMax(min_bcast);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
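
The functor above computes a per-image, per-channel mean over height and width, moves each pixel toward or away from that mean by contrast_factor, and clamps the result to [min_value, max_value]. The loop-based sketch below restates the same math on a contiguous NHWC float buffer; it is illustrative only and is not the Eigen code path built by this patch.

#include <algorithm>
#include <vector>

// Reference contrast adjustment over an NHWC float buffer (illustrative).
void AdjustContrastReference(const std::vector<float>& in, int batch,
                             int height, int width, int channels, float factor,
                             float min_value, float max_value,
                             std::vector<float>* out) {
  out->resize(in.size());
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      // Mean over the spatial dimensions for this image and channel.
      double sum = 0.0;
      for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x)
          sum += in[((b * height + y) * width + x) * channels + c];
      const float mean = static_cast<float>(sum / (height * width));
      // Scale the distance from the mean, then clamp.
      for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x) {
          const int idx = ((b * height + y) * width + x) * channels + c;
          const float adjusted = (in[idx] - mean) * factor + mean;
          (*out)[idx] = std::min(max_value, std::max(min_value, adjusted));
        }
    }
  }
}
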
diff --git a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
new file mode 100644
index 0000000000..75b177cf4d
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
@@ -0,0 +1,43 @@
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+static Graph* BM_AdjustContrast(int batches, int width, int height) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor in(DT_UINT8, TensorShape({batches, width, height, 3}));
+ in.flat<uint8>().setRandom();
+ Tensor factor(DT_FLOAT, TensorShape({}));
+ factor.flat<float>().setConstant(1.2);
+ Tensor min_value(DT_FLOAT, TensorShape({}));
+ min_value.flat<float>().setConstant(7.);
+ Tensor max_value(DT_FLOAT, TensorShape({}));
+ max_value.flat<float>().setConstant(250.);
+
+ Node* ret;
+ NodeBuilder(g->NewName("n"), "AdjustContrast")
+ .Input(test::graph::Constant(g, in))
+ .Input(test::graph::Constant(g, factor))
+ .Input(test::graph::Constant(g, min_value))
+ .Input(test::graph::Constant(g, max_value))
+ .Finalize(g, &ret);
+ return g;
+}
+
+#define BM_AdjustContrastDev(DEVICE, B, W, H) \
+ static void BM_AdjustContrast_##DEVICE##_##B##_##W##_##H(int iters) { \
+ testing::ItemsProcessed(iters* B* W* H * 3); \
+ test::Benchmark(#DEVICE, BM_AdjustContrast(B, W, H)).Run(iters); \
+ } \
+ BENCHMARK(BM_AdjustContrast_##DEVICE##_##B##_##W##_##H);
+
+// Benchmark results as of cl/106323955
+// BM_AdjustContrast_cpu_1_299_299 3416770 22008951 100 11.6M items/s
+
+// BM_AdjustContrast_gpu_32_299_299 37117844 45512374 100 179.8M items/s
+BM_AdjustContrastDev(cpu, 1, 299, 299) BM_AdjustContrastDev(gpu, 32, 299, 299)
+
+} // namespace tensorflow
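
Each BM_AdjustContrastDev invocation above stamps out a static benchmark function and registers it with BENCHMARK. Expanded by hand for the (cpu, 1, 299, 299) case, the macro produces approximately the following; this is a sketch of the expansion, not additional code in the patch.

// Approximate expansion of BM_AdjustContrastDev(cpu, 1, 299, 299).
static void BM_AdjustContrast_cpu_1_299_299(int iters) {
  testing::ItemsProcessed(iters * 1 * 299 * 299 * 3);
  test::Benchmark("cpu", BM_AdjustContrast(1, 299, 299)).Run(iters);
}
BENCHMARK(BM_AdjustContrast_cpu_1_299_299);
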
diff --git a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
new file mode 100644
index 0000000000..7a9b0726fd
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
@@ -0,0 +1,22 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/adjust_contrast_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::AdjustContrast<GPUDevice, uint8>;
+template struct functor::AdjustContrast<GPUDevice, int8>;
+template struct functor::AdjustContrast<GPUDevice, int16>;
+template struct functor::AdjustContrast<GPUDevice, int32>;
+template struct functor::AdjustContrast<GPUDevice, int64>;
+template struct functor::AdjustContrast<GPUDevice, float>;
+template struct functor::AdjustContrast<GPUDevice, double>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc
new file mode 100644
index 0000000000..67891e4fa1
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@@ -0,0 +1,88 @@
+#include "tensorflow/core/framework/allocator.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+class AdjustContrastOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() { RequireDefaultOps(); }
+};
+
+TEST_F(AdjustContrastOpTest, Simple_1113) {
+ RequireDefaultOps();
+  EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrast")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DT_FLOAT)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ AddInputFromArray<float>(TensorShape({1, 1, 1, 3}), {-1, 2, 3});
+ AddInputFromArray<float>(TensorShape({}), {1.0});
+ AddInputFromArray<float>(TensorShape({}), {0.0});
+ AddInputFromArray<float>(TensorShape({}), {2.0});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 3}));
+ test::FillValues<float>(&expected, {0, 2, 2});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(AdjustContrastOpTest, Simple_1223) {
+ RequireDefaultOps();
+  EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrast")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DT_FLOAT)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 3}),
+ {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12});
+ AddInputFromArray<float>(TensorShape({}), {0.2});
+ AddInputFromArray<float>(TensorShape({}), {0.0});
+ AddInputFromArray<float>(TensorShape({}), {10.0});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 3}));
+ test::FillValues<float>(
+ &expected, {2.2, 6.2, 10, 2.4, 6.4, 10, 2.6, 6.6, 10, 2.8, 6.8, 10});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(AdjustContrastOpTest, Big_99x99x3) {
+  EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrast")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("T", DT_FLOAT)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+
+ std::vector<float> values;
+ for (int i = 0; i < 99 * 99 * 3; ++i) {
+ values.push_back(i % 255);
+ }
+
+ AddInputFromArray<float>(TensorShape({1, 99, 99, 3}), values);
+ AddInputFromArray<float>(TensorShape({}), {0.2});
+ AddInputFromArray<float>(TensorShape({}), {0});
+ AddInputFromArray<float>(TensorShape({}), {255});
+ ASSERT_OK(RunOpKernel());
+}
+
+} // namespace tensorflow
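
The expected tensor in Simple_1223 follows directly from the functor's formula. The 1x2x2x3 input interleaves channels, so channel 0 holds {1, 2, 3, 4} (mean 2.5), channel 1 holds {5, 6, 7, 8} (mean 6.5), and channel 2 holds {9, 10, 11, 12} (mean 10.5). With contrast_factor 0.2 each value becomes (x - mean) * 0.2 + mean, giving {2.2, 2.4, 2.6, 2.8} for channel 0 and {6.2, 6.4, 6.6, 6.8} for channel 1; channel 2 would give {10.2, 10.4, 10.6, 10.8} but is clamped to the max_value of 10, which is why every third expected entry is 10.
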
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
new file mode 100644
index 0000000000..426e868735
--- /dev/null
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -0,0 +1,238 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/aggregate_ops.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+
+#include "tensorflow/core/platform/logging.h"
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class AddNOp : public OpKernel {
+ public:
+ explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ if (!ctx->ValidateInputsAreSameShape(this)) return;
+
+ const Tensor& input0 = ctx->input(0);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
+ auto To = output->flat<T>();
+
+ const int num = ctx->num_inputs();
+ if (num == 1) {
+ *output = input0;
+ return;
+ }
+
+#define I(IDX) ctx->input(IDX).flat<T>()
+
+#if defined(PLATFORM_POSIX_ANDROID) || defined(PLATFORM_GOOGLE_ANDROID)
+ // On Android, we only support additions of two arguments, so we
+ // can reduce the number of template instantiations.
+ OP_REQUIRES(ctx, num == 2,
+ errors::InvalidArgument("Only additions of two arguments "
+ "supported. Num inputs: ",
+ num));
+ functor::Add2Functor<Device, T> functor2;
+ functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
+#else
+ static const int kWidth = 8;
+ int r = num % kWidth;
+
+ switch (r) {
+ case 2: {
+ functor::Add2Functor<Device, T> functor2;
+ functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
+ break;
+ }
+ case 3: {
+ functor::Add3Functor<Device, T> functor3;
+ functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2));
+ break;
+ }
+ case 4: {
+ functor::Add4Functor<Device, T> functor4;
+ functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3));
+ break;
+ }
+ case 5: {
+ functor::Add5Functor<Device, T> functor5;
+ functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4));
+ break;
+ }
+ case 6: {
+ functor::Add6Functor<Device, T> functor6;
+ functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5));
+ break;
+ }
+ case 7: {
+ functor::Add7Functor<Device, T> functor7;
+ functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5), I(6));
+ break;
+ }
+ case 0: {
+ functor::Add8Functor<Device, T> functor8;
+ functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5), I(6), I(7));
+ r = 8;
+ break;
+ }
+ case 1: {
+ functor::Add9Functor<Device, T> functor9;
+ functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
+ I(3), I(4), I(5), I(6), I(7), I(8));
+ r = 9;
+ break;
+ }
+ }
+
+ for (; r < num; r += kWidth) {
+ functor::Add8pFunctor<Device, T> functor8p;
+ functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1),
+ I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7));
+ }
+#endif // defined(PLATFORM_POSIX_ANDROID) || defined(PLATFORM_GOOGLE_ANDROID)
+
+#undef I
+ }
+};
+
+// Partial specializations for a CPUDevice, that uses the Eigen implementation
+// from AddNEigenImpl.
+namespace functor {
+template <typename T>
+struct Add2Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2) {
+ Add2EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2);
+ }
+};
+template <typename T>
+struct Add3Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3) {
+ Add3EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3);
+ }
+};
+template <typename T>
+struct Add4Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4) {
+ Add4EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4);
+ }
+};
+template <typename T>
+struct Add5Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5) {
+ Add5EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5);
+ }
+};
+template <typename T>
+struct Add6Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6) {
+ Add6EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6);
+ }
+};
+template <typename T>
+struct Add7Functor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7) {
+ Add7EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7);
+ }
+};
+
+template <typename T>
+struct Add8Functor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add8pFunctor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8pEigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add9Functor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9) {
+ Add9EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8, in9);
+ }
+};
+
+} // namespace functor
+
+#define REGISTER_ADDN(type, dev) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
+ AddNOp<dev##Device, type>)
+
+#define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU)
+
+TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU);
+#undef REGISTER_ADDN_CPU
+
+#if GOOGLE_CUDA
+REGISTER_ADDN(float, GPU);
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_ADDN
+
+} // namespace tensorflow
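
AddNOp::Compute above sums its inputs in chunks of eight: after the single-input early-out, the switch on num % kWidth writes a first partial sum of between two and nine inputs into the output, and the trailing loop then folds in the remaining inputs eight at a time with Add8pFunctor, which accumulates with += instead of assigning. The helper below is only an illustration of that dispatch (a hypothetical function, not part of the patch); for an input count of at least two it returns the size of the initial assigning chunk and the number of Add8p passes.

#include <utility>

// Illustrative: mirrors the chunked dispatch in AddNOp::Compute.
std::pair<int, int> AddNDispatchPlan(int num) {
  const int kWidth = 8;
  int first = num % kWidth;   // switch arm that assigns to the output
  if (first == 0) first = 8;  // case 0 uses Add8Functor
  if (first == 1) first = 9;  // case 1 uses Add9Functor
  int passes = 0;             // each pass is one Add8pFunctor (+=) call
  for (int r = first; r < num; r += kWidth) ++passes;
  return {first, passes};     // e.g. num == 11 -> {3, 1}; num == 17 -> {9, 1}
}
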
diff --git a/tensorflow/core/kernels/aggregate_ops.h b/tensorflow/core/kernels/aggregate_ops.h
new file mode 100644
index 0000000000..2214901970
--- /dev/null
+++ b/tensorflow/core/kernels/aggregate_ops.h
@@ -0,0 +1,211 @@
+#ifndef TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
+#define TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
+
+// Functor definitions for Aggregate ops, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct Add2Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2);
+};
+
+template <typename Device, typename T>
+struct Add2EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2) {
+ out.device(d) = in1 + in2;
+ }
+};
+
+template <typename Device, typename T>
+struct Add3Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3);
+};
+
+template <typename Device, typename T>
+struct Add3EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3) {
+ out.device(d) = in1 + in2 + in3;
+ }
+};
+
+template <typename Device, typename T>
+struct Add4Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4);
+};
+
+template <typename Device, typename T>
+struct Add4EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4) {
+ out.device(d) = in1 + in2 + in3 + in4;
+ }
+};
+
+template <typename Device, typename T>
+struct Add5Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5);
+};
+
+template <typename Device, typename T>
+struct Add5EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5;
+ }
+};
+
+template <typename Device, typename T>
+struct Add6Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6);
+};
+
+template <typename Device, typename T>
+struct Add6EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6;
+ }
+};
+
+template <typename Device, typename T>
+struct Add7Functor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7);
+};
+
+template <typename Device, typename T>
+struct Add7EigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7;
+ }
+};
+
+template <typename Device, typename T>
+struct Add8Functor {
+ void operator()(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8);
+};
+
+template <typename Device, typename T>
+struct Add8EigenImpl {
+ static void Compute(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8;
+ }
+};
+
+// Add8p is like Add8 except the underlying implementation should +=
+// rather than assign to the output.
+template <typename Device, typename T>
+struct Add8pFunctor {
+ void operator()(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8);
+};
+
+template <typename Device, typename T>
+struct Add8pEigenImpl {
+ static void Compute(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ out.device(d) += in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8;
+ }
+};
+
+template <typename Device, typename T>
+struct Add9Functor {
+ void operator()(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9);
+};
+
+template <typename Device, typename T>
+struct Add9EigenImpl {
+ static void Compute(
+ const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9) {
+ out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8 + in9;
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
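
This header only declares the AddN functor family and supplies device-agnostic AddNEigenImpl bodies; aggregate_ops.cc and aggregate_ops_gpu.cu.cc then provide the CPU and GPU partial specializations that forward to those bodies, so the header stays compilable by nvcc while each device builds only its own kernels. Stripped down to a single arity with made-up names, the pattern is roughly the sketch below; it is not part of the patch.

// Pattern sketch only: generic declaration, shared body, per-device specialization.
template <typename Device, typename T>
struct AddTwo {              // declared generically, defined per device
  void operator()(const Device& d, T* out, const T* a, const T* b, int n);
};

template <typename Device, typename T>
struct AddTwoImpl {          // shared body usable from any translation unit
  static void Compute(const Device& d, T* out, const T* a, const T* b, int n) {
    for (int i = 0; i < n; ++i) out[i] = a[i] + b[i];  // stand-in for Eigen
  }
};

struct CPUDeviceTag {};      // stand-in for Eigen::ThreadPoolDevice

template <typename T>
struct AddTwo<CPUDeviceTag, T> {  // specialization compiled in the CPU .cc
  void operator()(const CPUDeviceTag& d, T* out, const T* a, const T* b,
                  int n) {
    AddTwoImpl<CPUDeviceTag, T>::Compute(d, out, a, b, n);
  }
};
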
diff --git a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
new file mode 100644
index 0000000000..5cf2934ac1
--- /dev/null
+++ b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
@@ -0,0 +1,141 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/aggregate_ops.h"
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization for a GPUDevice, that uses the Eigen implementation.
+namespace functor {
+template <typename T>
+struct Add2Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2) {
+ Add2EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2);
+ }
+};
+
+template <typename T>
+struct Add3Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3) {
+ Add3EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3);
+ }
+};
+
+template <typename T>
+struct Add4Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4) {
+ Add4EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4);
+ }
+};
+
+template <typename T>
+struct Add5Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5) {
+ Add5EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5);
+ }
+};
+
+template <typename T>
+struct Add6Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6) {
+ Add6EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6);
+ }
+};
+
+template <typename T>
+struct Add7Functor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1,
+ typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3,
+ typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5,
+ typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7) {
+ Add7EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7);
+ }
+};
+
+template <typename T>
+struct Add8Functor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add8pFunctor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
+ Add8pEigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8);
+ }
+};
+
+template <typename T>
+struct Add9Functor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2,
+ typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4,
+ typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
+ typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
+ typename TTypes<T>::ConstFlat in9) {
+ Add9EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
+ in7, in8, in9);
+ }
+};
+
+} // end namespace functor
+
+// Instantiate the GPU implementation for float.
+template struct functor::Add2Functor<GPUDevice, float>;
+template struct functor::Add3Functor<GPUDevice, float>;
+template struct functor::Add4Functor<GPUDevice, float>;
+template struct functor::Add5Functor<GPUDevice, float>;
+template struct functor::Add6Functor<GPUDevice, float>;
+template struct functor::Add7Functor<GPUDevice, float>;
+template struct functor::Add8Functor<GPUDevice, float>;
+template struct functor::Add8pFunctor<GPUDevice, float>;
+template struct functor::Add9Functor<GPUDevice, float>;
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
new file mode 100644
index 0000000000..0845eebf09
--- /dev/null
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -0,0 +1,163 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/argmax_op.h"
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T, typename ArgFunctor>
+class ArgOp : public OpKernel {
+ public:
+ explicit ArgOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& dimension = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(dimension.shape()),
+ errors::InvalidArgument(
+ "dim must be a scalar, but received tensor of shape: ",
+ dimension.shape().DebugString()));
+
+ const int32 dim = dimension.scalar<int32>()();
+ const int input_dims = input.dims();
+
+ OP_REQUIRES(context, dim >= 0, errors::InvalidArgument("dim must be >= 0"));
+ OP_REQUIRES(context, dim < input_dims,
+                errors::InvalidArgument("Minimum tensor rank: ", dim + 1,
+ " but got: ", input_dims));
+
+ TensorShape output_shape;
+ TensorShape input_shape = input.shape();
+ for (int d = 0; d < input_dims - 1; ++d) {
+ output_shape.AddDim(input_shape.dim_size((d < dim) ? d : d + 1));
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+#define HANDLE_DIM(NDIM) \
+ case NDIM: \
+ ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(), \
+ input.tensor<T, NDIM>(), dim, \
+ output->tensor<int64, NDIM - 1>()); \
+ break;
+
+ switch (input_dims) {
+ HANDLE_DIM(1);
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "ArgOp : Unhandled input dimensions: ", input_dims));
+ }
+ }
+#undef HANDLE_DIM
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
+};
+
+template <typename Device, typename T>
+class ArgMaxOp : public ArgOp<Device, T, functor::ArgMax<Device, T> > {
+ public:
+ explicit ArgMaxOp(OpKernelConstruction* context)
+ : ArgOp<Device, T, functor::ArgMax<Device, T> >(context) {}
+};
+
+template <typename Device, typename T>
+class ArgMinOp : public ArgOp<Device, T, functor::ArgMin<Device, T> > {
+ public:
+ explicit ArgMinOp(OpKernelConstruction* context)
+ : ArgOp<Device, T, functor::ArgMin<Device, T> >(context) {}
+};
+
+#define REGISTER_ARGMAX(type) \
+ REGISTER_KERNEL_BUILDER(Name("ArgMax") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMaxOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER(Name("ArgMin") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMinOp<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_ARGMAX);
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void ArgMax<GPUDevice, T>::Reduce##Dims( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output); \
+ template <> \
+ void ArgMin<GPUDevice, T>::Reduce##Dims( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output);
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 1); \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+#define DECLARE_GPU_CLASS(T) \
+ extern template struct ArgMax<GPUDevice, T>; \
+ extern template struct ArgMin<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_CLASS);
+
+#undef DECLARE_GPU_SPECS
+#undef DECLARE_GPU_CLASS
+
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_ARGMAX_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("ArgMax") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMaxOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER(Name("ArgMin") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dimension"), \
+ ArgMinOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_ARGMAX_GPU);
+
+#undef REGISTER_ARGMAX_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
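A minimal standalone sketch (illustrative only, not part of the kernel) of the output-shape rule used in ArgOp::Compute above: the reduced dimension `dim` is dropped and the remaining dimensions keep their order.

#include <cstdio>
#include <vector>

int main() {
  // A [2, 3, 5] input reduced along dim = 1 yields a [2, 5] output of indices.
  const std::vector<int> input_shape = {2, 3, 5};
  const int dim = 1;
  std::vector<int> output_shape;
  for (int d = 0; d < static_cast<int>(input_shape.size()) - 1; ++d) {
    output_shape.push_back(input_shape[(d < dim) ? d : d + 1]);
  }
  for (int d : output_shape) std::printf("%d ", d);  // prints: 2 5
  std::printf("\n");
  return 0;
}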
diff --git a/tensorflow/core/kernels/argmax_op.h b/tensorflow/core/kernels/argmax_op.h
new file mode 100644
index 0000000000..41734f3254
--- /dev/null
+++ b/tensorflow/core/kernels/argmax_op.h
@@ -0,0 +1,55 @@
+#ifndef TENSORFLOW_KERNELS_ARGMAX_OP_H_
+#define TENSORFLOW_KERNELS_ARGMAX_OP_H_
+// Functor definitions for ArgMaxOp and ArgMinOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct ArgMax {
+#define DECLARE_COMPUTE_SPEC(Dims) \
+ EIGEN_ALWAYS_INLINE static void Reduce##Dims( \
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, \
+ typename TTypes<int64, Dims - 1>::Tensor output) { \
+ output.device(d) = input.argmax(dimension).template cast<int64>(); \
+ }
+
+ DECLARE_COMPUTE_SPEC(1);
+ DECLARE_COMPUTE_SPEC(2);
+ DECLARE_COMPUTE_SPEC(3);
+ DECLARE_COMPUTE_SPEC(4);
+ DECLARE_COMPUTE_SPEC(5);
+
+#undef DECLARE_COMPUTE_SPEC
+};
+
+template <typename Device, typename T>
+struct ArgMin {
+#define DECLARE_COMPUTE_SPEC(Dims) \
+ EIGEN_ALWAYS_INLINE static void Reduce##Dims( \
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input, \
+ const int32 dimension, \
+ typename TTypes<int64, Dims - 1>::Tensor output) { \
+ output.device(d) = input.argmin(dimension).template cast<int64>(); \
+ }
+
+ DECLARE_COMPUTE_SPEC(1);
+ DECLARE_COMPUTE_SPEC(2);
+ DECLARE_COMPUTE_SPEC(3);
+ DECLARE_COMPUTE_SPEC(4);
+ DECLARE_COMPUTE_SPEC(5);
+
+#undef DECLARE_COMPUTE_SPEC
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_ARGMAX_OP_H_
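For reference, a standalone sketch of what the Reduce2 expression above evaluates to, assuming Eigen's unsupported Tensor module is on the include path (plain unsupported/Eigen/... rather than the third_party prefix used in this tree):

#include <cstdint>
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> input(2, 3);
  input.setValues({{1.f, 5.f, 2.f}, {7.f, 0.f, 3.f}});
  // Same expression as Reduce2: argmax along dimension 1, cast to int64.
  Eigen::Tensor<int64_t, 1> indices = input.argmax(1).cast<int64_t>();
  std::cout << indices(0) << " " << indices(1) << std::endl;  // prints: 1 0
  return 0;
}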
diff --git a/tensorflow/core/kernels/argmax_op_gpu.cu.cc b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
new file mode 100644
index 0000000000..6c91fc2c86
--- /dev/null
+++ b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
@@ -0,0 +1,20 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/argmax_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_SPEC(T) \
+ template struct functor::ArgMax<GPUDevice, T>; \
+ template struct functor::ArgMin<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
new file mode 100644
index 0000000000..3306f1eeaa
--- /dev/null
+++ b/tensorflow/core/kernels/assign_op.h
@@ -0,0 +1,92 @@
+#ifndef TENSORFLOW_KERNELS_ASSIGN_OP_H_
+#define TENSORFLOW_KERNELS_ASSIGN_OP_H_
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// TODO(jeff): Get rid of use_exclusive_lock_ option
+
+// Computes *input[0] = input[1]
+class AssignOp : public OpKernel {
+ public:
+ explicit AssignOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("use_locking", &use_exclusive_lock_));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("validate_shape", &validate_shape_));
+ OP_REQUIRES(context, IsRefType(context->input_type(0)),
+ errors::InvalidArgument("lhs input needs to be a ref type"));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ Tensor rhs = context->input(1);
+
+ // We always return the input ref.
+ context->forward_ref_input_to_ref_output(0, 0);
+
+    // If the left-hand side is not initialized, or the shape of the
+    // right-hand side differs from that of the left-hand side, we need
+    // to allocate a new tensor.
+ {
+ mutex_lock l(*context->input_ref_mutex(0));
+
+ Tensor old_lhs = context->mutable_input(0, true);
+
+ if (validate_shape_) {
+ OP_REQUIRES(
+ context, old_lhs.shape().IsSameSize(rhs.shape()),
+ errors::InvalidArgument(
+ "Assign requires shapes of both tensors to match. lhs shape= ",
+ old_lhs.shape().ShortDebugString(), " rhs shape= ",
+ rhs.shape().ShortDebugString()));
+ }
+
+ const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape());
+ if (!old_lhs.IsInitialized() || !same_shape) {
+        // Create a new tensor whose shape matches the right-hand side,
+        // copy the data into it, then hand it off to the lhs.
+ // We can't always know how this value will be used downstream,
+ // so make conservative assumptions in specifying the memory
+ // allocation attributes.
+ AllocatorAttributes attr;
+ attr.set_gpu_compatible(true);
+ PersistentTensor copy;
+ Tensor* copyTensor = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(),
+ &copy, &copyTensor, attr));
+ Copy(context, copyTensor, rhs);
+ context->replace_ref_input(0, *copyTensor, true);
+ return;
+ }
+
+ // The tensor has already been initialized and the right hand side
+ // matches the left hand side's shape.
+ if (use_exclusive_lock_) {
+ Copy(context, &old_lhs, rhs);
+ return;
+ }
+ }
+
+ // The tensor has already been initialized and the right hand side
+ // matches the left hand side's shape. We have been told to do the
+ // copy outside the lock.
+ Tensor old_unlocked_lhs = context->mutable_input(0, false);
+ Copy(context, &old_unlocked_lhs, rhs);
+ }
+
+ virtual void Copy(OpKernelContext* context, Tensor* lhs,
+ const Tensor& rhs) = 0;
+
+ bool use_exclusive_lock_;
+ bool validate_shape_;
+};
+
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_ASSIGN_OP_H_
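A minimal sketch of how a concrete kernel can plug into this base class; the class name below is hypothetical and float-only, not the actual TensorFlow dense-update kernels:

// Hypothetical subclass: only Copy() has to be provided; AssignOp handles
// locking, shape validation and (re)allocation of the lhs buffer.
class AssignCpuFloatOp : public AssignOp {
 public:
  explicit AssignCpuFloatOp(OpKernelConstruction* context)
      : AssignOp(context) {}

  void Copy(OpKernelContext* context, Tensor* lhs, const Tensor& rhs) override {
    lhs->flat<float>().device(context->eigen_cpu_device()) = rhs.flat<float>();
  }
};

// Registration would look roughly like (illustrative only):
// REGISTER_KERNEL_BUILDER(
//     Name("Assign").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//     AssignCpuFloatOp);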
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
new file mode 100644
index 0000000000..28763f65a4
--- /dev/null
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -0,0 +1,92 @@
+// See docs in ../ops/attention_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+
+class ExtractGlimpseOp : public OpKernel {
+ public:
+ explicit ExtractGlimpseOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("normalized", &normalized_));
+ OP_REQUIRES_OK(context, context->GetAttr("centered", &centered_));
+ OP_REQUIRES_OK(context, context->GetAttr("uniform_noise", &uniform_noise_));
+ }
+
+ // Expect input tensor of rank 4 with dimensions (batch_size, height, width,
+ // depth).
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const TensorShape input_shape = input.shape();
+ const int32 num_dims = input_shape.dims();
+ OP_REQUIRES(
+ context, num_dims == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional (batch_size, height, width, depth)",
+ input_shape.ShortDebugString()));
+
+ const int64 batch_size = input_shape.dim_size(0);
+
+ const Tensor& window_size = context->input(1);
+    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
+                            window_size.shape().dim_size(0) == 2,
+                errors::InvalidArgument(
+                    "window_size must be a vector of size 2 (height, width)",
+                    window_size.shape().ShortDebugString()));
+
+ const int64 output_height = window_size.tensor<int, 1>()(0);
+ const int64 output_width = window_size.tensor<int, 1>()(1);
+ TensorShape output_shape = input_shape;
+ output_shape.set_dim(1, output_height);
+ output_shape.set_dim(2, output_width);
+
+ const Tensor& offsets = context->input(2);
+    OP_REQUIRES(context, offsets.shape().dims() == 2,
+                errors::InvalidArgument("offsets must be a matrix",
+                                        offsets.shape().ShortDebugString()));
+    OP_REQUIRES(context, offsets.shape().dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "first dimension of offsets must equal the batch size",
+                    offsets.shape().ShortDebugString()));
+ OP_REQUIRES(
+ context, offsets.shape().dim_size(1) == 2,
+ errors::InvalidArgument("second dimension should be of size 2 (y,x)",
+ offsets.shape().ShortDebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ std::vector<Eigen::IndexPair<float> > offset_vec;
+ offset_vec.reserve(batch_size);
+ for (int i = 0; i < batch_size; ++i) {
+ float offset_y = offsets.tensor<float, 2>()(i, 0);
+ float offset_x = offsets.tensor<float, 2>()(i, 1);
+      // Eigen::ExtractGlimpses expects offsets as (x,y), whereas the
+      // TensorFlow op receives them as (y,x) indices, so swap them here.
+ offset_vec.push_back(Eigen::IndexPair<float>(offset_x, offset_y));
+ }
+
+ output->tensor<float, 4>().swap_layout().device(
+ context->eigen_cpu_device()) =
+ Eigen::ExtractGlimpses(input.tensor<float, 4>().swap_layout(),
+ output_width, output_height, offset_vec,
+ normalized_, centered_, uniform_noise_);
+ }
+
+ private:
+ bool normalized_;
+ bool centered_;
+ bool uniform_noise_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExtractGlimpse").Device(DEVICE_CPU),
+ ExtractGlimpseOp);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
new file mode 100644
index 0000000000..26f98ffbcd
--- /dev/null
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -0,0 +1,418 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/avgpooling_op.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class AvgPoolingOp : public UnaryOp<T> {
+ public:
+ explicit AvgPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+ OP_REQUIRES(context, params.depth_window == 1,
+ errors::Unimplemented(
+ "Non-spatial pooling is not "
+ "yet supported. Volunteers? :)"));
+
+ // For avgpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, params.forward_output_shape(), &output));
+
+ if (std::is_same<Device, GPUDevice>::value) {
+ Eigen::PaddingType pt = BrainPadding2EigenPadding(padding_);
+ functor::SpatialAvgPooling<Device, T>()(
+ context->eigen_device<Device>(), output->tensor<T, 4>(),
+ tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, pt);
+ } else {
+ SpatialAvgPool<Device, T>(context, output, tensor_in, params, padding_);
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPool")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ AvgPoolingOp<CPUDevice, float>);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void SpatialAvgPooling<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, int window_rows, \
+ int window_cols, int row_stride, int col_stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialAvgPooling<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("AvgPool")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ AvgPoolingOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+// The operation to compute AvgPool gradients.
+// It takes two inputs:
+// - The original input tensor shape
+// - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <typename Device, class T>
+class AvgPoolingGradOp : public OpKernel {
+ public:
+ explicit AvgPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in_shape = context->input(0);
+ const Tensor& out_backprop = context->input(1);
+ // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements.
+    OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
+                             tensor_in_shape.NumElements() == 4,
+                errors::InvalidArgument(
+                    "orig_input_shape must be 1-dimensional with 4 "
+                    "elements"));
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+ const int64 out_backprop_batch = out_backprop.dim_size(0);
+ const int64 out_backprop_rows = out_backprop.dim_size(1);
+ const int64 out_backprop_cols = out_backprop.dim_size(2);
+ const int64 out_backprop_depth = out_backprop.dim_size(3);
+
+ TensorShape output_shape;
+ auto shape_vec = tensor_in_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
+ output_shape.AddDim(shape_vec(i));
+ }
+ const int64 in_rows = output_shape.dim_size(1);
+ const int64 in_cols = output_shape.dim_size(2);
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ output->flat<T>().setZero();
+
+ const int window_rows = ksize_[1];
+ const int window_cols = ksize_[2];
+ const int depth_window = ksize_[3];
+
+ const int row_stride = stride_[1];
+ const int col_stride = stride_[2];
+
+ // We (will) use different code for spatial pooling and
+ // non-spatial pooling.
+ //
+ // Spatial pooling is when depth_window = 1
+ OP_REQUIRES(context, depth_window == 1,
+ errors::Unimplemented(
+ "Non-spatial pooling is not "
+ "yet supported. Volunteers? :)"));
+
+ int out_height, out_width, pad_rows, pad_cols;
+ OP_REQUIRES_OK(
+ context, Get2dOutputSize(in_rows, in_cols, window_rows, window_cols,
+ row_stride, col_stride, padding_, &out_height,
+ &out_width, &pad_rows, &pad_cols));
+
+ const T* out_backprop_ptr = out_backprop.flat<T>().data();
+ T* input_backprop_ptr = output->flat<T>().data();
+
+ for (int64 b = 0; b < out_backprop_batch; ++b) {
+ for (int64 r = 0; r < out_backprop_rows; ++r) {
+ // Calculates row broadcast size. For SAME padding, current
+ // index could be in the padding area, and r*row_stride +
+ // window_rows could be beyond the input tensor's boundary. In
+ // such cases, change the starting index and reduce the
+ // broadcast size.
+ int rindex, rsize;
+ OP_REQUIRES_OK(context,
+ GetBroadcastSize(r, in_rows, window_rows, row_stride,
+ pad_rows, &rindex, &rsize));
+ for (int64 c = 0; c < out_backprop_cols; ++c) {
+ // Calculates col broadcast size. For SAME padding, current
+ // index could be in the padding area, and c*col_stride +
+ // window_cols could be beyond the input tensor's boundary. In
+ // such cases, change the starting index and reduce the
+ // broadcast size.
+ int cindex, csize;
+ OP_REQUIRES_OK(context,
+ GetBroadcastSize(c, in_cols, window_cols, col_stride,
+ pad_cols, &cindex, &csize));
+
+ T divide_coeff = 1.0 / (rsize * csize);
+ int64 output_index =
+ (b * out_backprop_rows + r) * out_backprop_cols + c;
+ for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) {
+ for (int64 c_dst = cindex; c_dst < cindex + csize; ++c_dst) {
+ int64 input_index = (b * in_rows + r_dst) * in_cols + c_dst;
+ const T* output_offset =
+ out_backprop_ptr + output_index * out_backprop_depth;
+ T* input_offset =
+ input_backprop_ptr + input_index * out_backprop_depth;
+ for (int64 d = 0; d < out_backprop_depth; ++d) {
+ *input_offset += *output_offset * divide_coeff;
+ ++output_offset;
+ ++input_offset;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("orig_input_shape"),
+ AvgPoolingGradOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("orig_input_shape"),
+ AvgPoolingGradOp<CPUDevice, double>);
+
+#if GOOGLE_CUDA
+
+// A cuDNN-based AvgPoolingGrad implementation. Padded cells are included as
+// candidates in the pooling windows.
+template <class T>
+class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
+ public:
+ typedef GPUDevice Device;
+
+ explicit AvgPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in_shape = context->input(0);
+ const Tensor& out_backprop = context->input(1);
+ // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements.
+    OP_REQUIRES(
+        context,
+        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+        errors::InvalidArgument(
+            "orig_input_shape must be 1-dimensional with 4 elements"));
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+
+ TensorShape output_shape;
+ auto shape_vec = tensor_in_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
+ output_shape.AddDim(shape_vec(i));
+ }
+
+ DnnPoolingGradOp<T>::Compute(
+ context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
+ stride_, padding_, nullptr, nullptr, out_backprop, output_shape);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("orig_input_shape")
+ .Label("cudnn"),
+ AvgPoolingGradOp<GPUDevice, float>);
+
+// A custom GPU-kernel-based AvgPoolingGrad implementation. Padded cells are
+// included as candidates in the pooling windows.
+template <class T>
+class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
+ public:
+ typedef GPUDevice Device;
+
+ explicit AvgPoolingGradOpCustomGPUKernel(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in_shape = context->input(0);
+ const Tensor& out_backprop = context->input(1);
+ // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements.
+    OP_REQUIRES(
+        context,
+        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+        errors::InvalidArgument(
+            "orig_input_shape must be 1-dimensional with 4 elements"));
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+ const int64 out_backprop_batch = out_backprop.dim_size(0);
+ const int64 out_backprop_rows = out_backprop.dim_size(1);
+ const int64 out_backprop_cols = out_backprop.dim_size(2);
+ const int64 out_backprop_depth = out_backprop.dim_size(3);
+
+ TensorShape output_shape;
+ auto shape_vec = tensor_in_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
+ output_shape.AddDim(shape_vec(i));
+ }
+ const int64 in_rows = output_shape.dim_size(1);
+ const int64 in_cols = output_shape.dim_size(2);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ const int window_rows = ksize_[1];
+ const int window_cols = ksize_[2];
+ const int depth_window = ksize_[3];
+
+ const int row_stride = stride_[1];
+ const int col_stride = stride_[2];
+
+ // We (will) use different code for spatial pooling and
+ // non-spatial pooling.
+ //
+ // Spatial pooling is when depth_window = 1
+ OP_REQUIRES(context, depth_window == 1,
+ errors::Unimplemented("Non-spatial pooling is not "
+ "yet supported. Volunteers? :)"));
+
+ int out_height, out_width, pad_rows, pad_cols;
+ OP_REQUIRES_OK(
+ context, Get2dOutputSize(in_rows, in_cols, window_rows, window_cols,
+ row_stride, col_stride, padding_, &out_height,
+ &out_width, &pad_rows, &pad_cols));
+
+ RunAvePoolBackwardNHWC<T>(out_backprop.flat<T>().data(), // top_diff
+ out_backprop_batch, // num
+ in_rows, // height
+ in_cols, // width
+ out_backprop_depth, // channels
+ out_backprop_rows, // pooled_height
+ out_backprop_cols, // pooled_width
+ window_rows, // kernel_h
+ window_cols, // kernel_w
+ row_stride, // stride_h
+ col_stride, // stride_w
+ pad_rows, // pad_t
+ pad_cols, // pad_l
+ output->flat<T>().data(), // bottom_diff
+ context->eigen_gpu_device()); // d
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("orig_input_shape"),
+ AvgPoolingGradOpCustomGPUKernel<float>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
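A self-contained numeric sketch of the gradient rule implemented by the CPU AvgPoolingGradOp above (one channel, 4x4 input, 2x2 window, stride 2, VALID padding): every output gradient is spread uniformly over its pooling window, i.e. scaled by the same divide_coeff as in the kernel.

#include <cstdio>

int main() {
  // 4x4 input, 2x2 window, stride 2, VALID padding -> 2x2 output.
  const int in_size = 4, out_size = 2, ksize = 2, stride = 2;
  const float out_backprop[2][2] = {{1.f, 2.f}, {3.f, 4.f}};
  float in_backprop[4][4] = {};
  for (int r = 0; r < out_size; ++r) {
    for (int c = 0; c < out_size; ++c) {
      // Same role as divide_coeff above: split the gradient evenly over the
      // ksize x ksize window that produced this output cell.
      const float coeff = out_backprop[r][c] / (ksize * ksize);
      for (int dr = 0; dr < ksize; ++dr)
        for (int dc = 0; dc < ksize; ++dc)
          in_backprop[r * stride + dr][c * stride + dc] += coeff;
    }
  }
  for (int r = 0; r < in_size; ++r) {
    for (int c = 0; c < in_size; ++c) std::printf("%.2f ", in_backprop[r][c]);
    std::printf("\n");
  }
  return 0;
}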
diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h
new file mode 100644
index 0000000000..38f0eb97e5
--- /dev/null
+++ b/tensorflow/core/kernels/avgpooling_op.h
@@ -0,0 +1,58 @@
+#ifndef TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
+#define TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
+// Functor definition for AvgPoolingOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct SpatialAvgPooling {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input, int window_rows,
+ int window_cols, int row_stride, int col_stride,
+ const Eigen::PaddingType& padding) {
+ // Because we swap the layout, we swap the row/cols as well
+ output.swap_layout().device(d) =
+ Eigen::SpatialAvgPooling(input.swap_layout(), window_cols, window_rows,
+ col_stride, row_stride, padding);
+ }
+};
+
+} // namespace functor
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Launch a custom GPU kernel from Yanqing for the avgpooling backward
+// operation that works on the NHWC data format.
+// Arguments:
+// top_diff: backprop to the output of the pooling layer
+// num: number of input batches
+// height: input height
+// width: input width
+// channels: number of input channels
+// pooled_height: the height of the output to the pooling layer
+// pooled_width: the width of the output to the pooling layer
+// kernel_h: the height of the pooling kernel
+// kernel_w: the width of the pooling kernel
+//    stride_h: the vertical stride
+//    stride_w: the horizontal stride
+// pad_t: padding size to the top side
+// pad_l: padding size to the left side
+// bottom_diff: backprop to the input of the pooling layer.
+template <typename T>
+bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, T* const bottom_diff,
+ const GPUDevice& d);
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
new file mode 100644
index 0000000000..ec84ee6862
--- /dev/null
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -0,0 +1,101 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+#include <iostream>
+
+#include "tensorflow/core/kernels/avgpooling_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::SpatialAvgPooling<GPUDevice, T>;
+
+DEFINE_GPU_KERNELS(float)
+
+#undef DEFINE_GPU_KERNELS
+
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x)
+
+static const int CAFFE_CUDA_NUM_THREADS = 1024;
+
+template <typename dtype>
+__global__ void AvePoolBackwardNHWC(const int nthreads,
+ const dtype* const top_diff, const int num,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, dtype* const bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // Recover the (n, h, w, c) coordinates, in the padded input frame,
+    // from the flat NHWC index.
+ const int c = index % channels;
+ const int w = index / channels % width + pad_l;
+ const int h = (index / channels / width) % height + pad_t;
+ const int n = index / channels / width / height;
+ const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+ const int phend = min(h / stride_h + 1, pooled_height);
+ const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+ const int pwend = min(w / stride_w + 1, pooled_width);
+ dtype gradient = 0;
+ const dtype* const top_diff_slice =
+ top_diff + n * pooled_height * pooled_width * channels + c;
+ for (int ph = phstart; ph < phend; ++ph) {
+ for (int pw = pwstart; pw < pwend; ++pw) {
+ // figure out the pooling size
+ int hstart = ph * stride_h - pad_t;
+ int wstart = pw * stride_w - pad_l;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ gradient +=
+ top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
+ }
+ }
+ bottom_diff[index] = gradient;
+ }
+}
+
+template <typename T>
+bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, T* const bottom_diff,
+ const GPUDevice& d) {
+ int x_size = num * height * width * channels;
+ int thread_per_block =
+ std::min(CAFFE_CUDA_NUM_THREADS, d.maxCudaThreadsPerMultiProcessor());
+ int block_count = (x_size + thread_per_block - 1) / thread_per_block;
+ AvePoolBackwardNHWC<T><<<block_count, thread_per_block, 0, d.stream()>>>(
+ x_size, top_diff, num, height, width, channels, pooled_height,
+      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ bottom_diff);
+
+ return d.ok();
+}
+
+template bool RunAvePoolBackwardNHWC(
+ const float* const top_diff, const int num, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ float* const bottom_diff, const GPUDevice& d);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
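CUDA_1D_KERNEL_LOOP above is a grid-stride loop. A minimal standalone sketch of the same pattern (hypothetical kernel, not part of this patch):

// Each thread starts at its global index and steps by the total number of
// threads in the grid, so any problem size n is covered regardless of the
// launch configuration.
__global__ void ScaleKernel(const float* in, float* out, float alpha, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = alpha * in[i];
  }
}

// Launch example: ScaleKernel<<<(n + 255) / 256, 256>>>(in, out, 2.0f, n);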
diff --git a/tensorflow/core/kernels/batch_matmul_op.cc b/tensorflow/core/kernels/batch_matmul_op.cc
new file mode 100644
index 0000000000..349aac0158
--- /dev/null
+++ b/tensorflow/core/kernels/batch_matmul_op.cc
@@ -0,0 +1,260 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename Scalar>
+struct LaunchBatchMatMul;
+
+template <typename Scalar>
+struct LaunchBatchMatMul<CPUDevice, Scalar> {
+ static void Launch(OpKernelContext* context, const Tensor& in_x,
+ const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+ auto Tx = in_x.tensor<Scalar, 3>();
+ auto Ty = in_y.tensor<Scalar, 3>();
+ auto Tz = out->tensor<Scalar, 3>();
+
+ // Shards "n"-matmuls into "num" shards. Each shard is
+ // dispatched to a thread.
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+ const int64 num_units = in_x.dim_size(0);
+ const int64 cost_per_unit =
+ in_x.dim_size(0) * in_x.dim_size(1) * out->dim_size(2);
+ Shard(worker_threads.num_threads, worker_threads.workers, num_units,
+ cost_per_unit, [&Tx, &Ty, adj_x, adj_y, &Tz](int start, int limit) {
+ LaunchBatchMatMul<CPUDevice, Scalar>::Run(Tx, Ty, adj_x, adj_y, Tz,
+ start, limit);
+ });
+ }
+
+ template <typename In, typename Out>
+ static void Run(In Tx, In Ty, bool adj_x, bool adj_y, Out Tz, int start,
+ int limit) {
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
+
+ Eigen::internal::scalar_conjugate_op<Scalar> conj;
+ if (!adj_x && !adj_y) {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i);
+ auto y = Ty.template chip<0>(i);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ } else if (!adj_x && adj_y) {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i);
+ auto y = Ty.template chip<0>(i).unaryExpr(conj);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 1);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ } else if (adj_x && !adj_y) {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i).unaryExpr(conj);
+ auto y = Ty.template chip<0>(i);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(0, 0);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ } else {
+ for (int i = start; i < limit; ++i) {
+ auto x = Tx.template chip<0>(i).unaryExpr(conj);
+ auto y = Ty.template chip<0>(i).unaryExpr(conj);
+ auto z = Tz.template chip<0>(i);
+ contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(0, 1);
+ z = x.contract(y, contract_pairs); // matmul
+ }
+ }
+ }
+};
+
+#if GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+template <typename Scalar>
+struct LaunchBatchMatMul<GPUDevice, Scalar> {
+ static void Launch(OpKernelContext* context, const Tensor& in_x,
+ const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+ perftools::gputools::blas::Transpose trans[] = {
+ perftools::gputools::blas::Transpose::kNoTranspose,
+ perftools::gputools::blas::Transpose::kTranspose};
+ const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
+ const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
+ const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
+ const uint64 batch_size = in_x.dim_size(0);
+ auto blas_transpose_a = trans[adj_x];
+ auto blas_transpose_b = trans[adj_y];
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+ std::vector<DeviceMemoryType> a_device_memory;
+ std::vector<DeviceMemoryType> b_device_memory;
+ std::vector<DeviceMemoryType> c_device_memory;
+ std::vector<DeviceMemoryType*> a_ptrs;
+ std::vector<DeviceMemoryType*> b_ptrs;
+ std::vector<DeviceMemoryType*> c_ptrs;
+ a_device_memory.reserve(batch_size);
+ b_device_memory.reserve(batch_size);
+ c_device_memory.reserve(batch_size);
+ a_ptrs.reserve(batch_size);
+ b_ptrs.reserve(batch_size);
+ c_ptrs.reserve(batch_size);
+ auto* a_base_ptr = in_x.template flat<Scalar>().data();
+ auto* b_base_ptr = in_y.template flat<Scalar>().data();
+ auto* c_base_ptr = out->template flat<Scalar>().data();
+ for (int64 i = 0; i < batch_size; ++i) {
+ a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k));
+ b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n));
+ c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n));
+ a_ptrs.push_back(&a_device_memory.back());
+ b_ptrs.push_back(&b_device_memory.back());
+ c_ptrs.push_back(&c_device_memory.back());
+ }
+
+    // cuBLAS computes
+    //   C = A x B
+    // with A, B and C in column-major order. Our tensors are row-major, so
+    // we instead compute
+    //   C' = B' x A'   (' denotes transpose)
+    // which leaves C laid out correctly in row-major memory.
+ bool blas_launch_status =
+ stream->ThenBlasGemmBatched(blas_transpose_b, blas_transpose_a, n, m, k,
+ static_cast<Scalar>(1.0), b_ptrs,
+ adj_y ? k : n, a_ptrs, adj_x ? m : k,
+ static_cast<Scalar>(0.0), c_ptrs, n,
+ batch_size)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(errors::Internal(
+ "Blas SGEMMBatched launch failed : a.shape=",
+ in_x.shape().DebugString(), ", b.shape=", in_y.shape().DebugString(),
+ ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size));
+ }
+ }
+};
+
+#endif // GOOGLE_CUDA
+
+template <typename Device, typename Scalar>
+class BatchMatMul : public OpKernel {
+ public:
+ explicit BatchMatMul(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
+ OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
+ }
+
+ virtual ~BatchMatMul() {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& in0 = ctx->input(0);
+ const Tensor& in1 = ctx->input(1);
+    OP_REQUIRES(ctx, in0.dims() == in1.dims(),
+                errors::InvalidArgument(
+                    "In[0] and In[1] have different ndims: ",
+                    in0.shape().ShortDebugString(), " vs. ",
+                    in1.shape().ShortDebugString()));
+ const int ndims = in0.dims();
+ OP_REQUIRES(
+ ctx, ndims >= 3,
+ errors::InvalidArgument("In[0] and In[1] ndims must be >= 3: ", ndims));
+ TensorShape out_shape;
+ for (int i = 0; i < ndims - 2; ++i) {
+ OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
+ errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
+ i, ") must be the same: ",
+ in0.shape().DebugString(), " vs ",
+ in1.shape().DebugString()));
+ out_shape.AddDim(in0.dim_size(i));
+ }
+ auto n = out_shape.num_elements();
+ auto d0 = in0.dim_size(ndims - 2);
+ auto d1 = in0.dim_size(ndims - 1);
+ Tensor in0_reshaped;
+ CHECK(in0_reshaped.CopyFrom(in0, TensorShape({n, d0, d1})));
+ auto d2 = in1.dim_size(ndims - 2);
+ auto d3 = in1.dim_size(ndims - 1);
+ Tensor in1_reshaped;
+ CHECK(in1_reshaped.CopyFrom(in1, TensorShape({n, d2, d3})));
+ if (adj_x_) std::swap(d0, d1);
+ if (adj_y_) std::swap(d2, d3);
+ OP_REQUIRES(ctx, d1 == d2,
+ errors::InvalidArgument(
+ "In[0] mismatch In[1] shape: ", d1, " vs. ", d2, ": ",
+ in0.shape().ShortDebugString(), " ",
+ in1.shape().ShortDebugString(), " ", adj_x_, " ", adj_y_));
+ out_shape.AddDim(d0);
+ out_shape.AddDim(d3);
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+ if (out->NumElements() == 0) {
+ return;
+ }
+ if (in0.NumElements() == 0 || in1.NumElements() == 0) {
+ functor::SetZeroFunctor<Device, Scalar> f;
+ f(ctx->eigen_device<Device>(), out->flat<Scalar>());
+ return;
+ }
+ Tensor out_reshaped;
+ CHECK(out_reshaped.CopyFrom(*out, TensorShape({n, d0, d3})));
+ LaunchBatchMatMul<Device, Scalar>::Launch(ctx, in0_reshaped, in1_reshaped,
+ adj_x_, adj_y_, &out_reshaped);
+ }
+
+ private:
+ bool adj_x_;
+ bool adj_y_;
+};
+
+#define REGISTER_CPU(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+ BatchMatMul<CPUDevice, TYPE>)
+
+#define REGISTER_GPU(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+ BatchMatMul<GPUDevice, TYPE>)
+
+REGISTER_CPU(float);
+REGISTER_CPU(double);
+REGISTER_CPU(int32);
+REGISTER_CPU(complex64);
+
+#ifdef GOOGLE_CUDA
+// TODO(kalakris): The GPU implementation is currently disabled due to issues
+// encountered in practice. See b/24534272.
+// REGISTER_GPU(float);
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_CPU
+#undef REGISTER_GPU
+} // end namespace tensorflow
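A small standalone check of the layout argument used in LaunchBatchMatMul<GPUDevice, Scalar> above: a row-major C = A x B occupies exactly the same memory as a column-major C' = B' x A', so a column-major GEMM can be handed the row-major buffers with the operand order swapped.

#include <cstdio>

// Naive column-major matmul: a is m x k, b is k x n, c is m x n.
void MatMulColMajor(const float* a, const float* b, float* c,
                    int m, int n, int k) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += a[p * m + i] * b[j * k + p];
      c[j * m + i] = acc;
    }
}

int main() {
  const float a[] = {1, 2, 3, 4, 5, 6};     // row-major A, 2x3
  const float b[] = {7, 8, 9, 10, 11, 12};  // row-major B, 3x2
  float c[4];                               // will hold row-major C = A*B, 2x2
  // Feed the column-major routine B' and A' (the same buffers, order swapped).
  MatMulColMajor(b, a, c, /*m=*/2, /*n=*/2, /*k=*/3);
  std::printf("%g %g\n%g %g\n", c[0], c[1], c[2], c[3]);  // 58 64 / 139 154
  return 0;
}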
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
new file mode 100644
index 0000000000..c67c921631
--- /dev/null
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -0,0 +1,223 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/batch_norm_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BatchNormOp : public OpKernel {
+ public:
+ explicit BatchNormOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("variance_epsilon", &variance_epsilon_));
+ OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization",
+ &scale_after_normalization_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& mean = context->input(1);
+ const Tensor& var = context->input(2);
+ const Tensor& beta = context->input(3);
+ const Tensor& gamma = context->input(4);
+
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ OP_REQUIRES(context, mean.dims() == 1,
+ errors::InvalidArgument("mean must be 1-dimensional",
+ mean.shape().ShortDebugString()));
+ OP_REQUIRES(context, var.dims() == 1,
+ errors::InvalidArgument("var must be 1-dimensional",
+ var.shape().ShortDebugString()));
+ OP_REQUIRES(context, beta.dims() == 1,
+ errors::InvalidArgument("beta must be 1-dimensional",
+ beta.shape().ShortDebugString()));
+ OP_REQUIRES(context, gamma.dims() == 1,
+ errors::InvalidArgument("gamma must be 1-dimensional",
+ gamma.shape().ShortDebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+ functor::BatchNorm<Device, T>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(),
+ var.vec<T>(), beta.vec<T>(), gamma.vec<T>(), variance_epsilon_,
+ scale_after_normalization_, output->tensor<T, 4>());
+ }
+
+ private:
+ float variance_epsilon_;
+ bool scale_after_normalization_;
+};
+
+template <typename Device, typename T>
+class BatchNormGradOp : public OpKernel {
+ public:
+ explicit BatchNormGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("variance_epsilon", &variance_epsilon_));
+ OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization",
+ &scale_after_normalization_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& mean = context->input(1);
+ const Tensor& var = context->input(2);
+ const Tensor& gamma = context->input(3);
+ const Tensor& out_backprop = context->input(4);
+
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ OP_REQUIRES(context, mean.dims() == 1,
+ errors::InvalidArgument("mean must be 1-dimensional",
+ mean.shape().ShortDebugString()));
+ OP_REQUIRES(context, var.dims() == 1,
+ errors::InvalidArgument("var must be 1-dimensional",
+ var.shape().ShortDebugString()));
+ OP_REQUIRES(context, gamma.dims() == 1,
+ errors::InvalidArgument("gamma must be 1-dimensional",
+ gamma.shape().ShortDebugString()));
+ OP_REQUIRES(
+ context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional",
+ out_backprop.shape().ShortDebugString()));
+
+ Tensor* dx = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &dx));
+ Tensor* dm = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, mean.shape(), &dm));
+ Tensor* dv = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(2, var.shape(), &dv));
+ Tensor* db = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db));
+ Tensor* dg = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg));
+
+ // Scratch buffer of [depth] dimension, aka the 4th dimension of input,
+ // which is dim_size(3), for calculating various combinations of
+ // (var + epsilon).
+ Tensor scratch1;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({input.dim_size(3)}), &scratch1));
+
+ // Scratch buffer of [depth] dimension for saving intermediate calculation
+ // values.
+ Tensor scratch2;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({input.dim_size(3)}), &scratch2));
+
+ functor::BatchNormGrad<Device, T>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(),
+ var.vec<T>(), gamma.vec<T>(), out_backprop.tensor<T, 4>(),
+ variance_epsilon_, scale_after_normalization_, dx->tensor<T, 4>(),
+ dm->vec<T>(), dv->vec<T>(), db->vec<T>(), dg->vec<T>(),
+ scratch1.vec<T>(), scratch2.vec<T>());
+ }
+
+ private:
+ float variance_epsilon_;
+ bool scale_after_normalization_;
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormOp<CPUDevice, T>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void BatchNorm<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \
+ typename TTypes<T>::ConstVec beta, typename TTypes<T>::ConstVec gamma, \
+ float variance_epsilon, bool scale_after_normalization, \
+ typename TTypes<T, 4>::Tensor output); \
+ extern template struct BatchNorm<GPUDevice, T>;
+
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+
+DECLARE_GPU_SPECS(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormOp<GPUDevice, T>);
+
+REGISTER_GPU_KERNEL(float);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormGradOp<CPUDevice, T>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void BatchNormGrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \
+ typename TTypes<T>::ConstVec gamma, \
+ typename TTypes<T, 4>::ConstTensor out_backprop, float variance_epsilon, \
+ bool scale_after_normalization, typename TTypes<T, 4>::Tensor dx, \
+ typename TTypes<T>::Vec dm, typename TTypes<T>::Vec dv, \
+ typename TTypes<T>::Vec db, typename TTypes<T>::Vec dg, \
+ typename TTypes<T>::Vec scratch1, typename TTypes<T>::Vec scratch2); \
+ extern template struct BatchNormGrad<GPUDevice, T>;
+
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+
+DECLARE_GPU_SPECS(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T"), \
+ BatchNormGradOp<GPUDevice, T>);
+
+REGISTER_GPU_KERNEL(float);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_norm_op.h b/tensorflow/core/kernels/batch_norm_op.h
new file mode 100644
index 0000000000..5981e58460
--- /dev/null
+++ b/tensorflow/core/kernels/batch_norm_op.h
@@ -0,0 +1,133 @@
+#ifndef TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
+#define TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
+// Functor definition for BatchNormOp, must be compilable by nvcc.
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by BatchNormOp to do the computations.
+template <typename Device, typename T>
+struct BatchNorm {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T>::ConstVec mean,
+ typename TTypes<T>::ConstVec var,
+ typename TTypes<T>::ConstVec beta,
+ typename TTypes<T>::ConstVec gamma, float variance_epsilon,
+ bool scale_after_normalization,
+ typename TTypes<T, 4>::Tensor output) {
+ const int depth = mean.dimension(0);
+ const int rest_size = input.size() / depth;
+
+ Eigen::DSizes<int, 2> rest_by_depth(rest_size, depth);
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<int, 2> rest_by_one(rest_size, 1);
+ Eigen::DSizes<int, 2> one_by_depth(1, depth);
+ Eigen::DSizes<int, 2> depth_by_one(depth, 1);
+#else
+ Eigen::IndexList<int, Eigen::type2index<1> > rest_by_one;
+ rest_by_one.set(0, rest_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_depth;
+ one_by_depth.set(1, depth);
+ Eigen::IndexList<int, Eigen::type2index<1> > depth_by_one;
+ depth_by_one.set(0, depth);
+#endif
+ if (scale_after_normalization) {
+ output.reshape(rest_by_depth).device(d) =
+ (input.reshape(rest_by_depth) -
+ mean.reshape(one_by_depth).broadcast(rest_by_one)) *
+ ((var + var.constant(variance_epsilon)).rsqrt() * gamma)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one) +
+ beta.reshape(one_by_depth).broadcast(rest_by_one);
+ } else {
+ output.reshape(rest_by_depth).device(d) =
+ (input.reshape(rest_by_depth) -
+ mean.reshape(one_by_depth).broadcast(rest_by_one)) *
+ ((var + var.constant(variance_epsilon)).rsqrt())
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one) +
+ beta.reshape(one_by_depth).broadcast(rest_by_one);
+ }
+ }
+};
+
+template <typename Device, typename T>
+struct BatchNormGrad {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T>::ConstVec mean,
+ typename TTypes<T>::ConstVec var,
+ typename TTypes<T>::ConstVec gamma,
+ typename TTypes<T, 4>::ConstTensor out_backprop,
+ float variance_epsilon, bool scale_after_normalization,
+ typename TTypes<T, 4>::Tensor dx, typename TTypes<T>::Vec dm,
+ typename TTypes<T>::Vec dv, typename TTypes<T>::Vec db,
+ typename TTypes<T>::Vec dg, typename TTypes<T>::Vec scratch1,
+ typename TTypes<T>::Vec scratch2) {
+ const int depth = mean.dimension(0);
+ const int rest_size = input.size() / depth;
+
+ typedef typename TTypes<T>::ConstVec::Index Index;
+ Eigen::DSizes<Index, 2> rest_by_depth(rest_size, depth);
+ Eigen::DSizes<Index, 2> rest_by_one(rest_size, 1);
+ Eigen::DSizes<Index, 2> one_by_depth(1, depth);
+
+    // db = sum_over_rest(out_backprop)
+    //
+    // dg = sum_over_rest(out_backprop * (x - m)) * rsqrt(v + epsilon)
+    //
+    // dv = sum_over_rest(out_backprop * gamma * (x - m)) *
+    //      (-1/2) * (v + epsilon) ^ (-3/2)
+    //
+    // dm = sum_over_rest(out_backprop * gamma) * (-1) * rsqrt(v + epsilon)
+    //
+    // dx = out_backprop * (gamma * rsqrt(v + epsilon))
+ Eigen::array<Index, 1> reduction_axis;
+ reduction_axis[0] = 0; // Reduces on first dimension.
+
+ db.device(d) = out_backprop.reshape(rest_by_depth).sum(reduction_axis);
+
+ // scratch1 = rsqrt(v + epsilon)
+ scratch1.device(d) = (var + var.constant(variance_epsilon)).rsqrt();
+
+ // scratch2 = sum_over_rest(out_backprop * (x - m))
+ scratch2.device(d) = (out_backprop.reshape(rest_by_depth) *
+ (input.reshape(rest_by_depth) -
+ mean.reshape(one_by_depth).broadcast(rest_by_one)))
+ .sum(reduction_axis);
+
+ if (scale_after_normalization) {
+ dx.reshape(rest_by_depth).device(d) =
+ out_backprop.reshape(rest_by_depth) * ((scratch1 * gamma)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one));
+ dm.device(d) = -db * (scratch1 * gamma).eval();
+ dg.device(d) = scratch2 * scratch1;
+ } else {
+ dx.reshape(rest_by_depth).device(d) =
+ out_backprop.reshape(rest_by_depth) *
+ scratch1.reshape(one_by_depth).broadcast(rest_by_one);
+ dm.device(d) = -db * scratch1;
+ dg.device(d) = dg.constant(static_cast<T>(0.0)); // Gamma is not learned.
+ }
+
+ // scratch1 = - 1/2 * (var + epsilon) ^ (-3/2)
+ scratch1.device(d) = scratch1 * scratch1.constant(static_cast<T>(-0.5f)) /
+ (var + var.constant(variance_epsilon));
+
+ if (scale_after_normalization) {
+ dv.device(d) = scratch2 * (scratch1 * gamma).eval();
+ } else {
+ dv.device(d) = scratch2 * scratch1;
+ }
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
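The gradient comments in BatchNormGrad, restated in conventional notation (g denotes out_backprop, sums run over every dimension except depth, and m and v are treated as constants, matching the scale_after_normalization path of the functor above):

\[
\begin{aligned}
\frac{\partial L}{\partial \beta}  &= \sum_{\mathrm{rest}} g \\
\frac{\partial L}{\partial \gamma} &= \frac{1}{\sqrt{v+\epsilon}} \sum_{\mathrm{rest}} g\,(x-m) \\
\frac{\partial L}{\partial v}      &= -\tfrac{1}{2}\,\gamma\,(v+\epsilon)^{-3/2} \sum_{\mathrm{rest}} g\,(x-m) \\
\frac{\partial L}{\partial m}      &= -\frac{\gamma}{\sqrt{v+\epsilon}} \sum_{\mathrm{rest}} g \\
\frac{\partial L}{\partial x}      &= \frac{\gamma}{\sqrt{v+\epsilon}}\, g
\end{aligned}
\]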
diff --git a/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc
new file mode 100644
index 0000000000..02e0eeecfa
--- /dev/null
+++ b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc
@@ -0,0 +1,17 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/batch_norm_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::BatchNorm<GPUDevice, float>;
+template struct functor::BatchNormGrad<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc
new file mode 100644
index 0000000000..bb1492e5b4
--- /dev/null
+++ b/tensorflow/core/kernels/bcast_ops.cc
@@ -0,0 +1,71 @@
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/bcast.h"
+
+namespace tensorflow {
+
+// Given shapes of two tensors, computes the reduction indices for the
+// gradient computation.
+//
+// TODO(zhifengc):
+// 1. Add support for n-ary operations (n > 2).
+class BCastGradArgsOp : public OpKernel {
+ public:
+ explicit BCastGradArgsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(
+ ctx, ctx->MatchSignature({DT_INT32, DT_INT32}, {DT_INT32, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ OP_REQUIRES(
+ ctx, ctx->num_inputs() == 2,
+ errors::Unimplemented("Broadcast for n-ary operations (n > 2)"));
+ gtl::InlinedVector<BCast::Vec, 4> shapes;
+ for (int i = 0; i < ctx->num_inputs(); ++i) {
+ const Tensor& in = ctx->input(i);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(in.shape()),
+ errors::InvalidArgument("In[", i, "] must be a vector.",
+ in.shape().ShortDebugString()));
+ BCast::Vec vec;
+ for (int64 i = 0; i < in.NumElements(); ++i) {
+ vec.push_back(in.vec<int32>()(i));
+ }
+ shapes.push_back(vec);
+ }
+ BCast bcast(shapes[0], shapes[1]);
+ OP_REQUIRES(ctx, bcast.IsValid(),
+ errors::InvalidArgument(
+ "Incompatible shapes: [", str_util::Join(shapes[0], ","),
+ "] vs. [", str_util::Join(shapes[1], ","), "]"));
+ Output(ctx, 0, bcast.grad_x_reduce_idx());
+ Output(ctx, 1, bcast.grad_y_reduce_idx());
+ }
+
+ private:
+ void Output(OpKernelContext* ctx, int idx, const BCast::Vec& v) {
+ const int len = v.size();
+ Tensor* o = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(idx, TensorShape({len}), &o));
+ for (int i = 0; i < len; ++i) o->flat<int32>()(i) = v[i];
+ }
+
+ TF_DISALLOW_COPY_AND_ASSIGN(BCastGradArgsOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
+ .Device(DEVICE_CPU)
+ .HostMemory("s0")
+ .HostMemory("s1")
+ .HostMemory("r0")
+ .HostMemory("r1"),
+ BCastGradArgsOp);
+REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
+ .Device(DEVICE_GPU)
+ .HostMemory("s0")
+ .HostMemory("s1")
+ .HostMemory("r0")
+ .HostMemory("r1"),
+ BCastGradArgsOp);
+
+} // end namespace tensorflow
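A self-contained sketch of the reduction-index rule that BroadcastGradientArgs computes (a simplified derivation, not the BCast class itself): align the shapes from the right; every output dimension where an input was 1, or absent, must be summed over when back-propagating to that input.

#include <cstdio>
#include <vector>

std::vector<int> GradReduceIdx(const std::vector<int>& x,
                               const std::vector<int>& out) {
  std::vector<int> idx;
  const int offset = static_cast<int>(out.size()) - static_cast<int>(x.size());
  for (int d = 0; d < static_cast<int>(out.size()); ++d) {
    const int xd = (d < offset) ? 1 : x[d - offset];  // missing dims act as 1
    if (xd == 1 && out[d] > 1) idx.push_back(d);
  }
  return idx;
}

int main() {
  const std::vector<int> x = {2, 3, 5}, y = {3, 1};
  const std::vector<int> out = {2, 3, 5};  // broadcast shape of x and y
  std::printf("r0 (reduce for x):");
  for (int d : GradReduceIdx(x, out)) std::printf(" %d", d);  // (none)
  std::printf("\nr1 (reduce for y):");
  for (int d : GradReduceIdx(y, out)) std::printf(" %d", d);  // 0 2
  std::printf("\n");
  return 0;
}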
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
new file mode 100644
index 0000000000..68737f6c2d
--- /dev/null
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -0,0 +1,112 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/bias_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BiasOp : public BinaryOp<T> {
+ public:
+ explicit BiasOp(OpKernelConstruction* context) : BinaryOp<T>(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& bias = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input.shape()),
+ errors::InvalidArgument("Input tensor must be at least 2D: ",
+ input.shape().DebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(bias.shape()),
+ errors::InvalidArgument("Biases must be 1D: ",
+ bias.shape().DebugString()));
+ const auto last_dim = input.shape().dims() - 1;
+ OP_REQUIRES(
+ context, bias.shape().dim_size(0) == input.shape().dim_size(last_dim),
+ errors::InvalidArgument(
+ "Must provide as many biases as the last dimension "
+ "of the input tensor: ",
+ bias.shape().DebugString(), " vs. ", input.shape().DebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+ switch (input.shape().dims()) {
+ case 2:
+ Compute<2>(context, input, bias, output);
+ break;
+ case 3:
+ Compute<3>(context, input, bias, output);
+ break;
+ case 4:
+ Compute<4>(context, input, bias, output);
+ break;
+ case 5:
+ Compute<5>(context, input, bias, output);
+ break;
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument("Only ranks up to 5 supported: ",
+ input.shape().DebugString()));
+ }
+ }
+
+  // Add biases to an input tensor of rank Dims using the Bias functor.
+ template <int Dims>
+ void Compute(OpKernelContext* ctx, const Tensor& input, const Tensor& bias,
+ Tensor* output) {
+ functor::Bias<Device, T, Dims> functor;
+ functor(ctx->eigen_device<Device>(), input.tensor<T, Dims>(), bias.vec<T>(),
+ output->tensor<T, Dims>());
+ }
+};
+
+#define REGISTER_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BiasAdd").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ BiasOp<CPUDevice, type>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void Bias<GPUDevice, T, Dims>::operator()( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ typename TTypes<T>::ConstVec bias, \
+ typename TTypes<T, Dims>::Tensor output); \
+ extern template struct Bias<GPUDevice, T, Dims>;
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("BiasAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ BiasOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/bias_op.h b/tensorflow/core/kernels/bias_op.h
new file mode 100644
index 0000000000..513406d251
--- /dev/null
+++ b/tensorflow/core/kernels/bias_op.h
@@ -0,0 +1,41 @@
+#ifndef TENSORFLOW_KERNELS_BIAS_OP_H_
+#define TENSORFLOW_KERNELS_BIAS_OP_H_
+// Functor definition for BiasOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by BiasOp to do the computations.
+template <typename Device, typename T, int Dims>
+struct Bias {
+ // Add "bias" to "input", broadcasting it on all dimensions but the last one.
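+  // For example (illustrative shapes only): for an NHWC input of shape
+  // [N, H, W, C] and a bias vector of length C, the input is viewed as an
+  // [N*H*W, C] matrix and the bias, reshaped to [1, C], is broadcast across
+  // every row before the addition.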
+ void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ typename TTypes<T>::ConstVec bias,
+ typename TTypes<T, Dims>::Tensor output) {
+ const int bias_size = bias.dimension(0);
+ const int rest_size = input.size() / bias_size;
+
+ Eigen::DSizes<int, 2> rest_by_bias(rest_size, bias_size);
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<int, 2> rest_by_one(rest_size, 1);
+ Eigen::DSizes<int, 2> one_by_bias(1, bias_size);
+#else
+ Eigen::IndexList<int, Eigen::type2index<1> > rest_by_one;
+ rest_by_one.set(0, rest_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_bias;
+ one_by_bias.set(1, bias_size);
+#endif
+
+ output.reshape(rest_by_bias).device(d) =
+ input.reshape(rest_by_bias) +
+ bias.reshape(one_by_bias).broadcast(rest_by_one);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_BIAS_OP_H_
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
new file mode 100644
index 0000000000..d3377b3ce8
--- /dev/null
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -0,0 +1,23 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/bias_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in bias_op.cc.
+#define DEFINE_GPU_SPECS(T) \
+ template struct functor::Bias<GPUDevice, T, 2>; \
+ template struct functor::Bias<GPUDevice, T, 3>; \
+ template struct functor::Bias<GPUDevice, T, 4>; \
+ template struct functor::Bias<GPUDevice, T, 5>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc
new file mode 100644
index 0000000000..cd5fde37a6
--- /dev/null
+++ b/tensorflow/core/kernels/candidate_sampler_ops.cc
@@ -0,0 +1,243 @@
+// See docs in ../ops/candidate_sampling_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include <cfloat>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/range_sampler.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+
+namespace tensorflow {
+
+class BaseCandidateSamplerOp : public OpKernel {
+ public:
+ explicit BaseCandidateSamplerOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("num_sampled", &num_sampled_));
+ OP_REQUIRES_OK(context, context->GetAttr("num_true", &num_true_));
+ OP_REQUIRES_OK(context, context->GetAttr("unique", &unique_));
+ OP_REQUIRES_OK(context, generator_.Init(context));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& true_classes = context->input(0);
+ OP_REQUIRES(context, true_classes.dims() == 2,
+ errors::InvalidArgument("true_classes must be a matrix"));
+ const int32 batch_size = true_classes.dim_size(0);
+ OP_REQUIRES(context, true_classes.dim_size(1) == num_true_,
+ errors::InvalidArgument("true_classes must have "
+ "num_true columns"));
+
+ // Output candidates and expected_count.
+ Tensor* out_sampled_candidates = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({num_sampled_}),
+ &out_sampled_candidates));
+
+ Tensor* out_true_expected_count = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 1, TensorShape({batch_size, num_true_}),
+ &out_true_expected_count));
+ Tensor* out_sampled_expected_count = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(2, TensorShape({num_sampled_}),
+ &out_sampled_expected_count));
+
+ gtl::ArraySlice<int64> true_candidate(true_classes.matrix<int64>().data(),
+ batch_size * num_true_);
+ gtl::MutableArraySlice<int64> sampled_candidate(
+ out_sampled_candidates->vec<int64>().data(), num_sampled_);
+ gtl::MutableArraySlice<float> true_expected_count(
+ out_true_expected_count->matrix<float>().data(),
+ batch_size * num_true_);
+ gtl::MutableArraySlice<float> sampled_expected_count(
+ out_sampled_expected_count->vec<float>().data(), num_sampled_);
+
+ CHECK(sampler_) << "CandidateSamplerOp did not set sampler_";
+
+    // Conservatively estimate the number of random samples required.
+ // In cases where rejection sampling is used we may occasionally use more
+ // samples than expected, which will result in reused random bits.
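+    // For instance, with num_sampled_ = 64 the reservation below comes to
+    // 2048 * 64 = 131072 32-bit samples from the Philox generator.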
+ const int64 samples32 = 2048 * num_sampled_;
+
+ // Pick sampled candidates.
+ auto local_gen = generator_.ReserveSamples32(samples32);
+ random::SimplePhilox random(&local_gen);
+ sampler_->SampleBatchGetExpectedCount(&random, unique_, &sampled_candidate,
+ &sampled_expected_count,
+ true_candidate, &true_expected_count);
+
+ if (sampler_->NeedsUpdates()) {
+ sampler_->Update(true_candidate);
+ }
+ }
+
+ protected:
+ void set_sampler(RangeSampler* sampler) { sampler_.reset(sampler); }
+
+ private:
+ int32 num_true_;
+ int32 num_sampled_;
+ bool unique_;
+ std::unique_ptr<RangeSampler> sampler_;
+ GuardedPhiloxRandom generator_;
+};
+
+template <class RangeSamplerType>
+class SimpleCandidateSamplerOp : public BaseCandidateSamplerOp {
+ public:
+ explicit SimpleCandidateSamplerOp(OpKernelConstruction* context)
+ : BaseCandidateSamplerOp(context) {
+ int64 range_max;
+ OP_REQUIRES_OK(context, context->GetAttr("range_max", &range_max));
+ set_sampler(new RangeSamplerType(range_max));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<UniformSampler>);
+
+REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<LogUniformSampler>);
+
+REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
+ .Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<UnigramSampler>);
+
+REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
+ .Device(DEVICE_CPU),
+ SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+
+class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
+ public:
+ explicit AllCandidateSamplerOp(OpKernelConstruction* context)
+ : BaseCandidateSamplerOp(context) {
+ int64 range_max;
+ OP_REQUIRES_OK(context, context->GetAttr("num_sampled", &range_max));
+ set_sampler(new AllSampler(range_max));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("AllCandidateSampler").Device(DEVICE_CPU),
+ AllCandidateSamplerOp);
+
+class FixedUnigramCandidateSamplerOp : public BaseCandidateSamplerOp {
+ public:
+ explicit FixedUnigramCandidateSamplerOp(OpKernelConstruction* context)
+ : BaseCandidateSamplerOp(context) {
+ int64 range_max;
+ OP_REQUIRES_OK(context, context->GetAttr("range_max", &range_max));
+ string vocab_file;
+ OP_REQUIRES_OK(context, context->GetAttr("vocab_file", &vocab_file));
+ std::vector<float> unigrams;
+ OP_REQUIRES_OK(context, context->GetAttr("unigrams", &unigrams));
+ OP_REQUIRES(
+ context, !vocab_file.empty() || !unigrams.empty(),
+ errors::InvalidArgument("Must provide either vocab_file or unigrams."));
+ OP_REQUIRES(context, vocab_file.empty() || unigrams.empty(),
+ errors::InvalidArgument(
+ "Must only provide one of vocab_file and unigrams."));
+ float distortion;
+ OP_REQUIRES_OK(context, context->GetAttr("distortion", &distortion));
+ int64 num_reserved_ids;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("num_reserved_ids", &num_reserved_ids));
+ int64 num_shards;
+ OP_REQUIRES_OK(context, context->GetAttr("num_shards", &num_shards));
+ int64 shard;
+ OP_REQUIRES_OK(context, context->GetAttr("shard", &shard));
+
+ if (!vocab_file.empty()) {
+ set_sampler(new FixedUnigramSampler(context->env(), range_max, vocab_file,
+ distortion, num_reserved_ids,
+ num_shards, shard));
+ } else {
+ set_sampler(new FixedUnigramSampler(range_max, unigrams, distortion,
+ num_reserved_ids, num_shards, shard));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FixedUnigramCandidateSampler").Device(DEVICE_CPU),
+ FixedUnigramCandidateSamplerOp);
+
+class ComputeAccidentalHitsOp : public OpKernel {
+ public:
+ explicit ComputeAccidentalHitsOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("num_true", &num_true_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in_true_candidates = context->input(0);
+ TensorShape in_true_candidates_shape = in_true_candidates.shape();
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+ in_true_candidates_shape.dim_size(1) == num_true_,
+ errors::InvalidArgument(
+ "true_candidates must be a batch_size * num_true matrix"));
+
+ const int64 batch_size = in_true_candidates_shape.dim_size(0);
+
+ const Tensor& in_sampled_candidates = context->input(1);
+ OP_REQUIRES(context,
+ TensorShapeUtils::IsVector(in_sampled_candidates.shape()),
+ errors::InvalidArgument(
+ "sampled_candidates must be a vector, which is typically "
+ "an output from CandidateSampler"));
+
+ std::unordered_map<int64, int> sampled_candidate_to_pos;
+ for (int64 i = 0; i < in_sampled_candidates.dim_size(0); ++i) {
+ sampled_candidate_to_pos[in_sampled_candidates.vec<int64>()(i)] = i;
+ }
+
+ // Produce output in the same format as UnpackSparseFeatures.
+ std::vector<int> indices;
+ std::vector<int64> ids;
+ std::vector<float> weights;
+
+ for (int64 i = 0; i < batch_size; ++i) {
+ for (int64 j = 0; j < num_true_; ++j) {
+ const int64 true_candidate = in_true_candidates.matrix<int64>()(i, j);
+ const auto look = sampled_candidate_to_pos.find(true_candidate);
+ if (look != sampled_candidate_to_pos.end()) {
+ indices.push_back(i);
+ ids.push_back(look->second);
+ weights.push_back(-FLT_MAX);
+ }
+ }
+ }
+
+ Tensor* out_indices = nullptr;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_output(
+ 0, TensorShape({static_cast<int>(indices.size())}), &out_indices));
+ Tensor* out_ids = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_output(
+ 1, TensorShape({static_cast<int>(ids.size())}), &out_ids));
+ Tensor* out_weights = nullptr;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_output(
+ 2, TensorShape({static_cast<int>(weights.size())}), &out_weights));
+
+ for (size_t i = 0; i < indices.size(); ++i) {
+ out_indices->vec<int32>()(i) = indices[i];
+ out_ids->vec<int64>()(i) = ids[i];
+ out_weights->vec<float>()(i) = weights[i];
+ }
+ }
+
+ private:
+ int64 num_true_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ComputeAccidentalHits").Device(DEVICE_CPU),
+ ComputeAccidentalHitsOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
new file mode 100644
index 0000000000..779ac57b6a
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -0,0 +1,233 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/cast_op.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename Device, typename Tout, typename Tin>
+void CastMaybeInline(const Device& d, typename TTypes<Tout>::Flat o,
+ typename TTypes<Tin>::ConstFlat i) {
+ if (o.size() * (sizeof(Tin) + sizeof(Tout)) < 131072) {
+    // Small cast on a CPU: do it inline to avoid the overhead of dispatching
+    // to the device's thread pool.
+ o = i.template cast<Tout>();
+ } else {
+ o.device(d) = i.template cast<Tout>();
+ }
+}
+
+template <typename O, typename I>
+struct CastFunctor<CPUDevice, O, I> {
+ void operator()(const CPUDevice& d, typename TTypes<O>::Flat o,
+ typename TTypes<I>::ConstFlat i) {
+ CastMaybeInline<CPUDevice, O, I>(d, o, i);
+ }
+};
+
+} // namespace functor
+
+#define CAST_CASE(DEVICE, IN, OUT) \
+ if (DataTypeToEnum<IN>::value == src_dtype_ && \
+ DataTypeToEnum<OUT>::value == dst_dtype_) { \
+ work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) { \
+ functor::CastFunctor<DEVICE, OUT, IN> func; \
+ func(ctx->eigen_device<DEVICE>(), out->flat<OUT>(), inp.flat<IN>()); \
+ }; \
+ return Status::OK(); \
+ }
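+
+// For instance (an illustrative expansion, not part of this change),
+// CAST_CASE(CPUDevice, float, int64), used below, behaves roughly like:
+//
+//   if (DataTypeToEnum<float>::value == src_dtype_ &&
+//       DataTypeToEnum<int64>::value == dst_dtype_) {
+//     work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+//       functor::CastFunctor<CPUDevice, int64, float> func;
+//       func(ctx->eigen_device<CPUDevice>(), out->flat<int64>(),
+//            inp.flat<float>());
+//     };
+//     return Status::OK();
+//   }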
+
+class CastOpBase : public OpKernel {
+ public:
+ explicit CastOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("SrcT", &src_dtype_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &dst_dtype_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ if (work_ == nullptr) {
+ ctx->set_output(0, inp);
+ } else {
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
+ work_(ctx, inp, out);
+ }
+ }
+
+ protected:
+ DataType src_dtype_;
+ DataType dst_dtype_;
+ std::function<void(OpKernelContext*, const Tensor&, Tensor*)> work_ = nullptr;
+
+ virtual Status Prepare() = 0;
+ Status Unimplemented() {
+ return errors::Unimplemented("Cast ", DataTypeString(src_dtype_), " to ",
+ DataTypeString(dst_dtype_),
+ " is not supported");
+ }
+
+ TF_DISALLOW_COPY_AND_ASSIGN(CastOpBase);
+};
+
+class CpuCastOp : public CastOpBase {
+ public:
+ explicit CpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
+ OP_REQUIRES_OK(ctx, Prepare());
+ }
+
+ protected:
+ Status Prepare() override {
+ if (src_dtype_ == dst_dtype_) {
+ work_ = nullptr; // Identity
+ return Status::OK();
+ }
+ CAST_CASE(CPUDevice, bool, float);
+ CAST_CASE(CPUDevice, bool, int32);
+ CAST_CASE(CPUDevice, bool, double);
+ CAST_CASE(CPUDevice, double, float);
+ CAST_CASE(CPUDevice, double, int32);
+ CAST_CASE(CPUDevice, double, int64);
+ CAST_CASE(CPUDevice, float, double);
+ CAST_CASE(CPUDevice, float, uint8);
+ CAST_CASE(CPUDevice, float, int32);
+ CAST_CASE(CPUDevice, float, int64);
+ CAST_CASE(CPUDevice, int32, double);
+ CAST_CASE(CPUDevice, int32, float);
+ CAST_CASE(CPUDevice, int32, uint8);
+ CAST_CASE(CPUDevice, int32, int64);
+ CAST_CASE(CPUDevice, int64, double);
+ CAST_CASE(CPUDevice, int64, float);
+ CAST_CASE(CPUDevice, int64, int32);
+ CAST_CASE(CPUDevice, uint8, float);
+ CAST_CASE(CPUDevice, uint8, int32);
+ CAST_CASE(CPUDevice, uint8, int64);
+ CAST_CASE(CPUDevice, uint8, double);
+ if (src_dtype_ == DT_BFLOAT16 && dst_dtype_ == DT_FLOAT) {
+ work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+ int64 N = out->NumElements();
+ auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+ int num_threads =
+ std::min<int>(std::min(4, worker_threads->num_threads), N / 4096);
+ if (num_threads < 1) {
+ BFloat16ToFloat(inp.flat<bfloat16>().data(),
+ out->flat<float>().data(), N);
+ } else {
+ auto work = [&inp, &out](int64 start, int64 end) {
+ BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
+ out->flat<float>().data() + start, end - start);
+ };
+ Shard(num_threads, worker_threads->workers, N, 100, work);
+ }
+ };
+ return Status::OK();
+ }
+ if (src_dtype_ == DT_FLOAT && dst_dtype_ == DT_BFLOAT16) {
+ work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+ int64 N = out->NumElements();
+ auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+ int num_threads =
+ std::min<int>(std::min(4, worker_threads->num_threads), N / 4096);
+ if (num_threads < 1) {
+ FloatToBFloat16(inp.flat<float>().data(),
+ out->flat<bfloat16>().data(), N);
+ } else {
+ auto work = [&inp, &out](int64 start, int64 end) {
+ FloatToBFloat16(inp.flat<float>().data() + start,
+ out->flat<bfloat16>().data() + start, end - start);
+ };
+ Shard(num_threads, worker_threads->workers, N, 100, work);
+ }
+ };
+ return Status::OK();
+ }
+ return Unimplemented();
+ }
+};
+
+class GpuCastOp : public CastOpBase {
+ public:
+ explicit GpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
+ OP_REQUIRES_OK(ctx, Prepare());
+ }
+
+ protected:
+ Status Prepare() override {
+ if (src_dtype_ == dst_dtype_) {
+ work_ = nullptr; // Identity
+ return Status::OK();
+ }
+ CAST_CASE(GPUDevice, bfloat16, float);
+ CAST_CASE(GPUDevice, bool, float);
+ CAST_CASE(GPUDevice, double, float);
+ CAST_CASE(GPUDevice, double, int64);
+ CAST_CASE(GPUDevice, float, bfloat16);
+ CAST_CASE(GPUDevice, float, double);
+ CAST_CASE(GPUDevice, float, int64);
+ CAST_CASE(GPUDevice, int64, double);
+ CAST_CASE(GPUDevice, int64, float);
+ CAST_CASE(GPUDevice, uint8, float);
+ CAST_CASE(GPUDevice, float, uint8);
+ CAST_CASE(GPUDevice, bool, int32);
+ CAST_CASE(GPUDevice, double, int32);
+ CAST_CASE(GPUDevice, float, int32);
+ CAST_CASE(GPUDevice, int32, double);
+ CAST_CASE(GPUDevice, int32, float);
+ CAST_CASE(GPUDevice, int32, int64);
+ CAST_CASE(GPUDevice, int64, int32);
+ return Unimplemented();
+ }
+};
+
+#undef CAST_CASE
+
+REGISTER_KERNEL_BUILDER(Name("Cast").Device(DEVICE_CPU), CpuCastOp);
+
+#if GOOGLE_CUDA
+#define REGISTER_CAST_GPU(srctype, dsttype) \
+ REGISTER_KERNEL_BUILDER(Name("Cast") \
+ .TypeConstraint<srctype>("SrcT") \
+ .TypeConstraint<dsttype>("DstT") \
+ .Device(DEVICE_GPU), \
+ GpuCastOp);
+REGISTER_CAST_GPU(bfloat16, float);
+REGISTER_CAST_GPU(bool, float);
+REGISTER_CAST_GPU(double, float);
+REGISTER_CAST_GPU(double, int64);
+REGISTER_CAST_GPU(float, bfloat16);
+REGISTER_CAST_GPU(float, double);
+REGISTER_CAST_GPU(float, int64);
+REGISTER_CAST_GPU(int64, double);
+REGISTER_CAST_GPU(int64, float);
+REGISTER_CAST_GPU(uint8, float);
+REGISTER_CAST_GPU(float, uint8);
+REGISTER_CAST_GPU(bool, int32);
+REGISTER_CAST_GPU(double, int32);
+REGISTER_CAST_GPU(float, int32);
+REGISTER_CAST_GPU(int32, double);
+REGISTER_CAST_GPU(int32, float);
+REGISTER_CAST_GPU(int32, int64);
+REGISTER_CAST_GPU(int64, int32);
+#undef REGISTER_CAST_GPU
+#endif // GOOGLE_CUDA
+
+// HostCast differs from Cast in that its input and output are in host memory.
+REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp);
+REGISTER_KERNEL_BUILDER(
+ Name("_HostCast").Device(DEVICE_GPU).HostMemory("x").HostMemory("y"),
+ CpuCastOp);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
new file mode 100644
index 0000000000..d066206abc
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op.h
@@ -0,0 +1,71 @@
+#ifndef TENSORFLOW_KERNELS_CAST_OP_H_
+#define TENSORFLOW_KERNELS_CAST_OP_H_
+
+#include "tensorflow/core/framework/bfloat16.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/port.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename Tout, typename Tin>
+void Cast(const Device& d, typename TTypes<Tout>::Flat o,
+ typename TTypes<Tin>::ConstFlat i) {
+ o.device(d) = i.template cast<Tout>();
+}
+
+template <typename Device, typename Tout, typename Tin>
+struct CastFunctor {
+ void operator()(const Device& d, typename TTypes<Tout>::Flat o,
+ typename TTypes<Tin>::ConstFlat i);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+namespace Eigen {
+namespace internal {
+
+// Specialized cast op impls for bfloat16.
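+// A bfloat16 is just the high-order 16 bits of a float (sign bit, 8 exponent
+// bits, 7 mantissa bits). For example (illustrative values): 3.14f has the
+// bit pattern 0x4048F5C3; truncating to the upper half gives the bfloat16
+// pattern 0x4048, which converts back to 3.125f.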
+template <>
+struct scalar_cast_op< ::tensorflow::bfloat16, float> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+ typedef float result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(
+ const ::tensorflow::bfloat16& a) const {
+ static_assert(::tensorflow::port::kLittleEndian, "");
+ float ret;
+ uint16_t* p = reinterpret_cast<uint16_t*>(&ret);
+ p[0] = 0;
+ p[1] = a.value;
+ return ret;
+ }
+};
+
+template <>
+struct functor_traits<scalar_cast_op< ::tensorflow::bfloat16, float> > {
+ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false };
+};
+
+template <>
+struct scalar_cast_op<float, ::tensorflow::bfloat16> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+ typedef ::tensorflow::bfloat16 result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ::tensorflow::bfloat16 operator()(
+ const float a) const {
+ static_assert(::tensorflow::port::kLittleEndian, "");
+ const uint16_t* p = reinterpret_cast<const uint16_t*>(&a);
+ return ::tensorflow::bfloat16(p[1]);
+ }
+};
+
+template <>
+struct functor_traits<scalar_cast_op<float, ::tensorflow::bfloat16> > {
+ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false };
+};
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // TENSORFLOW_KERNELS_CAST_OP_H_
diff --git a/tensorflow/core/kernels/cast_op_gpu.cu.cc b/tensorflow/core/kernels/cast_op_gpu.cu.cc
new file mode 100644
index 0000000000..cd198c752b
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op_gpu.cu.cc
@@ -0,0 +1,45 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/bfloat16.h"
+#include "tensorflow/core/kernels/cast_op.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename O, typename I>
+struct CastFunctor<GPUDevice, O, I> {
+ void operator()(const GPUDevice& d, typename TTypes<O>::Flat o,
+ typename TTypes<I>::ConstFlat i) {
+ Cast<GPUDevice, O, I>(d, o, i);
+ }
+};
+
+#define DEFINE(O, I) template struct CastFunctor<GPUDevice, O, I>;
+DEFINE(float, double);
+DEFINE(float, int32);
+DEFINE(float, int64);
+DEFINE(double, float);
+DEFINE(double, int32);
+DEFINE(double, int64);
+DEFINE(int32, float);
+DEFINE(int32, double);
+DEFINE(int32, int64);
+DEFINE(int64, float);
+DEFINE(int64, double);
+DEFINE(int64, int32);
+DEFINE(int32, bool);
+DEFINE(float, bool);
+DEFINE(float, uint8);
+DEFINE(uint8, float);
+DEFINE(float, bfloat16);
+DEFINE(bfloat16, float);
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
new file mode 100644
index 0000000000..f774fbcfe8
--- /dev/null
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -0,0 +1,100 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+template <typename Src, typename Dst>
+static Graph* Cast(int num) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor data(DataTypeToEnum<Src>::value,
+ TensorShape({64, 64, num / (64 * 64)}));
+ data.flat<Src>().setRandom();
+ test::graph::Cast(g, test::graph::Constant(g, data),
+ DataTypeToEnum<Dst>::value);
+ return g;
+}
+
+class CastOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType src, DataType dst) {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
+ .Input(FakeInput(DT_INT32))
+ .Attr("SrcT", src)
+ .Attr("DstT", dst)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(CastOpTest, Int32ToUint8) {
+ MakeOp(DT_INT32, DT_UINT8);
+ AddInputFromArray<int32>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_UINT8, TensorShape({1, 2, 2, 1}));
+ test::FillValues<uint8>(&expected, {1, 2, 3, 4});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+static void BM_cpu_float_int64(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(int64)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<float, int64>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_gpu_float_int64(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(int64)));
+ testing::UseRealTime();
+ test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
+}
+BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_cpu_bool_float(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(bool) + sizeof(float)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<bool, float>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_gpu_bool_float(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(bool) + sizeof(float)));
+ testing::UseRealTime();
+ test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
+}
+BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_cpu_float_bfloat16(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(bfloat16)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<float, bfloat16>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_float_bfloat16)->Arg(64 << 10)->Arg(32 << 20);
+
+static void BM_cpu_bfloat16_float(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num *
+ (sizeof(float) + sizeof(bfloat16)));
+ testing::UseRealTime();
+ test::Benchmark("cpu", Cast<bfloat16, float>(num)).Run(iters);
+}
+BENCHMARK(BM_cpu_bfloat16_float)->Arg(64 << 10)->Arg(32 << 20);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
new file mode 100644
index 0000000000..65487a303c
--- /dev/null
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -0,0 +1,190 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <math.h>
+#include <algorithm>
+#include <numeric>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#if GOOGLE_CUDA
+template <typename T>
+struct CheckNumericsLaunch {
+ void Run(const GPUDevice& d, const T* data, int size,
+ int abnormal_detected[2]);
+};
+#endif
+
+namespace {
+
+template <typename Device, typename T>
+class CheckNumericsOp;
+
+// Partial specialization for CPU
+template <typename T>
+class CheckNumericsOp<CPUDevice, T> : public OpKernel {
+ public:
+ explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
+ // message_ is used as the prefix for the assertion error message. For
+ // instance, this can be the name of the input op that produced the tensor.
+ OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // pass along the input to the output
+ context->set_output(0, context->input(0));
+
+ auto in = context->input(0).flat<T>();
+ const T* data = in.data();
+ const int size = in.size();
+ // Check to see if any element of the tensor is NaN or Inf.
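+    // A single pass folds each element's classification into a bitmask:
+    // kInfBit is OR'd in when an Inf is seen, kNaNBit when a NaN is seen.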
+ int fp_props =
+ std::accumulate(data, data + size, 0, [](const int& x, const T& y) {
+ int prop = std::fpclassify(y);
+ int result = x;
+ if (prop == FP_INFINITE) {
+ result |= kInfBit;
+ } else if (prop == FP_NAN) {
+ result |= kNaNBit;
+ }
+ return result;
+ });
+ string status;
+ if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
+ status = "Inf and NaN";
+ } else {
+ if (fp_props & kInfBit) {
+ status = "Inf";
+ }
+ if (fp_props & kNaNBit) {
+ status = "NaN";
+ }
+ }
+ if (!status.empty()) {
+ context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
+ status, " values"));
+ }
+ }
+
+ private:
+ string message_;
+ static const int kInfBit = 0x01;
+ static const int kNaNBit = 0x02;
+};
+
+#if GOOGLE_CUDA
+// Partial specialization for GPU
+template <typename T>
+class CheckNumericsOp<GPUDevice, T> : public OpKernel {
+ public:
+ typedef GPUDevice Device;
+
+ explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
+ // message_ is used as the prefix for the assertion error message. For
+ // instance, this can be the name of the input op that produced the tensor.
+ OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // pass along the input to the output
+ context->set_output(0, context->input(0));
+ auto input = context->input(0).flat<T>();
+
+ // Allocate and initialize the elements to hold the check results
+ const int abnormal_detected_size = 2;
+ Tensor abnormal_detected;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DT_INT32, TensorShape({abnormal_detected_size}),
+ &abnormal_detected));
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ perftools::gputools::DeviceMemoryBase abnormal_detected_ptr(
+ abnormal_detected.flat<int>().data(),
+ abnormal_detected.flat<int>().size());
+ stream->ThenMemset32(&abnormal_detected_ptr, 0,
+ abnormal_detected.flat<int>().size() * sizeof(int));
+
+ // Call the Cuda kernels for the numerical checks
+ const Device& d = context->eigen_device<Device>();
+ CheckNumericsLaunch<T>().Run(d, input.data(), input.size(),
+ abnormal_detected.flat<int>().data());
+
+ // Copy the results from device to host
+ AllocatorAttributes attr;
+ attr.set_on_host(true);
+ attr.set_gpu_compatible(true);
+ Tensor abnormal_detected_out;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DT_INT32, TensorShape({abnormal_detected_size}),
+ &abnormal_detected_out, attr));
+ int* abnormal_detected_host = abnormal_detected_out.flat<int>().data();
+ stream->ThenMemcpy(abnormal_detected_host, abnormal_detected_ptr,
+ abnormal_detected_size * sizeof(int));
+ stream->BlockHostUntilDone();
+ OP_REQUIRES(context, stream->ok(),
+ errors::Internal("cudaMemcpy from device to host failed"));
+
+ int is_nan = abnormal_detected_host[0];
+ int is_inf = abnormal_detected_host[1];
+ if (is_nan || is_inf) {
+ string status;
+ LOG(ERROR) << "abnormal_detected_host @" << abnormal_detected_host
+ << " = {" << is_nan << ", " << is_inf << "} " << message_;
+
+ // Results should always be 1 or 0. If we see anything else then
+ // there has been some GPU memory corruption.
+ CHECK_GE(is_nan, 0);
+ CHECK_GE(is_inf, 0);
+ CHECK_LE(is_nan, 1);
+ CHECK_LE(is_inf, 1);
+
+ if (is_nan && is_inf) {
+ status = "Inf and NaN";
+ } else if (is_nan) {
+ status = "NaN";
+ } else if (is_inf) {
+ status = "Inf";
+ }
+ context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
+ status, " values"));
+ }
+ }
+
+ private:
+ string message_;
+};
+#endif // GOOGLE_CUDA
+
+} // namespace
+
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ CheckNumericsOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ CheckNumericsOp<CPUDevice, double>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ CheckNumericsOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<double>("T"),
+ CheckNumericsOp<GPUDevice, double>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
new file mode 100644
index 0000000000..cb84f98731
--- /dev/null
+++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
@@ -0,0 +1,62 @@
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+#include <assert.h>
+
+#include <math.h>
+#include <algorithm>
+
+#include "tensorflow/core/platform/port.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// A CUDA kernel that checks whether each element is Inf or NaN. If any such
+// element exists, the corresponding entry in abnormal_detected is set.
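+// Each thread starts at its global index and advances by the total number of
+// launched threads (a grid-stride loop), so every element is visited exactly
+// once regardless of the grid configuration.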
+template <typename T>
+__global__ void CheckNumericsKernel(const T *data, int size,
+ int abnormal_detected[2]) {
+ const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32 total_thread_count = gridDim.x * blockDim.x;
+
+ int32 offset = thread_id;
+
+ while (offset < size) {
+ if (isnan(data[offset])) {
+ abnormal_detected[0] = 1;
+ }
+ if (isinf(data[offset])) {
+ abnormal_detected[1] = 1;
+ }
+ offset += total_thread_count;
+ }
+}
+
+} // namespace
+
+// A simple launch pad for the CUDA kernel that checks the given array for
+// numerical abnormalities.
+template <typename T>
+struct CheckNumericsLaunch {
+ void Run(const GPUDevice &d, const T *data, int size,
+ int abnormal_detected[2]) {
+ const int32 block_size = d.maxCudaThreadsPerBlock();
+ const int32 num_blocks =
+ (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+ block_size;
+
+ CheckNumericsKernel<T><<<num_blocks, block_size, 0, d.stream()>>>(
+ data, size, abnormal_detected);
+ }
+};
+
+template struct CheckNumericsLaunch<float>;
+template struct CheckNumericsLaunch<double>;
+
+} // namespace tensorflow
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc
new file mode 100644
index 0000000000..12632fb248
--- /dev/null
+++ b/tensorflow/core/kernels/cholesky_op.cc
@@ -0,0 +1,71 @@
+// See docs in ../ops/linalg_ops.cc.
+// TODO(konstantinos): Enable complex inputs. This will require additional tests
+// and OP_REQUIRES.
+
+#include <cmath>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/Eigen/Cholesky"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperationT>
+class CholeskyOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
+ public:
+ explicit CholeskyOp(OpKernelConstruction* context)
+ : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {}
+
+ TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) override {
+ return input_matrix_shape;
+ }
+
+ int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
+ const int64 rows = input_matrix_shape.dim_size(0);
+ if (rows > (1LL << 20)) {
+      // Use a big number to cap the cost in case of overflow.
+ return kint32max;
+ } else {
+ return rows * rows * rows;
+ }
+ }
+
+ using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap;
+ using
+ typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap;
+
+ void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
+ MatrixMap* output) override {
+ OP_REQUIRES(context, input.rows() == input.cols(),
+ errors::InvalidArgument("Input matrix must be square."));
+ if (input.rows() == 0) {
+      // If X is an empty matrix (0 rows, 0 columns), X * X' == X,
+      // so we simply return X.
+ return;
+ }
+    // Perform the actual LL^T Cholesky decomposition. This will only use
+    // the lower triangular part of 'input' by default; the upper triangular
+    // part of the matrix is not read.
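+    // For example (illustrative values): A = [[4, 12], [12, 37]] factors as
+    // L = [[2, 0], [6, 1]], since L * L^T reproduces A.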
+ Eigen::LLT<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> llt_decomposition(input);
+
+ // Output the lower triangular in a dense form.
+ *output = llt_decomposition.matrixL();
+
+ OP_REQUIRES(context, llt_decomposition.info() == Eigen::Success,
+ errors::InvalidArgument("LLT decomposition was not successful. "
+ "The input might not be valid."));
+ }
+};
+
+REGISTER_LINALG_OP("Cholesky", (CholeskyOp<float, false>), float);
+REGISTER_LINALG_OP("Cholesky", (CholeskyOp<double, false>), double);
+REGISTER_LINALG_OP("BatchCholesky", (CholeskyOp<float, true>), float);
+REGISTER_LINALG_OP("BatchCholesky", (CholeskyOp<double, true>), double);
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
new file mode 100644
index 0000000000..b68fcec515
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -0,0 +1,153 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/concat_op.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// --------------------------------------------------------------------------
+template <typename Device, typename T>
+class ConcatOp : public OpKernel {
+ public:
+ typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+ ConstMatrixVector;
+
+ explicit ConcatOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor* concat_dim_tensor;
+ OP_REQUIRES_OK(c, c->input("concat_dim", &concat_dim_tensor));
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsLegacyScalar(concat_dim_tensor->shape()),
+ errors::InvalidArgument(
+ "Concat dim tensor should be a scalar integer, but got shape ",
+ concat_dim_tensor->shape().DebugString()));
+ const int32 concat_dim = concat_dim_tensor->scalar<int32>()();
+ OpInputList values;
+ OP_REQUIRES_OK(c, c->input_list("values", &values));
+ const int N = values.size();
+ const int input_dims = values[0].dims();
+ const TensorShape& input_shape = values[0].shape();
+ OP_REQUIRES(
+ c, (0 <= concat_dim && concat_dim < input_dims) ||
+ (kAllowLegacyScalars && concat_dim == 0),
+ errors::InvalidArgument(
+ "ConcatOp : Expected concatenating dimensions in the range [", 0,
+ ", ", input_dims, "), but got ", concat_dim));
+
+ // Note that we reduce the concat of n-dimensional tensors into a two
+ // dimensional concat. Assuming the dimensions of any input/output
+ // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+ // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+ // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
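+    // For example (illustrative shapes only): concatenating two [2, 3, 5]
+    // tensors along dimension 1 is treated as concatenating two [2, 15]
+    // matrices (x = 2, y = 3 * 5) into a [2, 30] result, which is then
+    // viewed as [2, 6, 5].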
+ ConstMatrixVector inputs_flat;
+ inputs_flat.reserve(N);
+ int64 inputs_flat_dim0 = 1;
+ for (int d = 0; d < concat_dim; ++d) {
+ inputs_flat_dim0 *= input_shape.dim_size(d);
+ }
+ int output_concat_dim = 0;
+ const bool input_is_scalar = TensorShapeUtils::IsLegacyScalar(input_shape);
+ for (int i = 0; i < N; ++i) {
+ const auto in = values[i];
+ const bool in_is_scalar = TensorShapeUtils::IsLegacyScalar(in.shape());
+ OP_REQUIRES(
+ c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
+ errors::InvalidArgument(
+ "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+ input_shape.ShortDebugString(), " vs. shape[", i, "] = ",
+ in.shape().ShortDebugString()));
+ for (int j = 0; j < input_dims; ++j) {
+ if (j == concat_dim) {
+ continue;
+ }
+ OP_REQUIRES(
+ c, in.dim_size(j) == input_shape.dim_size(j),
+ errors::InvalidArgument(
+ "ConcatOp : Dimensions of inputs should match: shape[0] = ",
+ input_shape.ShortDebugString(), " vs. shape[", i, "] = ",
+ in.shape().ShortDebugString()));
+ }
+ if (in.NumElements() > 0) {
+ int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+ inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+ in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+ }
+ // TODO(irving): Remove check once !kAllowLegacyScalars
+ output_concat_dim += in.dims() > 0 ? in.dim_size(concat_dim) : 1;
+ }
+
+ TensorShape output_shape(input_shape);
+ // TODO(irving): Remove rank 0 case once !kAllowLegacyScalars
+ if (output_shape.dims() == 0) {
+ output_shape.AddDim(output_concat_dim);
+ } else {
+ output_shape.set_dim(concat_dim, output_concat_dim);
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+ if (output->NumElements() > 0) {
+ int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+ auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+ if (std::is_same<Device, GPUDevice>::value) {
+ ConcatGPU<T>(c->eigen_gpu_device(), inputs_flat, &output_flat);
+ } else {
+ ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+ }
+ }
+ }
+};
+
+#define REGISTER_CONCAT(type) \
+ REGISTER_KERNEL_BUILDER(Name("Concat") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("concat_dim"), \
+ ConcatOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_CONCAT);
+REGISTER_CONCAT(quint8);
+REGISTER_CONCAT(qint8);
+REGISTER_CONCAT(qint32);
+REGISTER_CONCAT(bfloat16);
+
+#undef REGISTER_CONCAT
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("Concat") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("concat_dim"), \
+ ConcatOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Concat")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("concat_dim")
+ .HostMemory("values")
+ .HostMemory("output"),
+ ConcatOp<CPUDevice, int32>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.h b/tensorflow/core/kernels/concat_op.h
new file mode 100644
index 0000000000..664e55080d
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op.h
@@ -0,0 +1,27 @@
+#ifndef TENSORFLOW_KERNELS_CONCAT_OP_H_
+#define TENSORFLOW_KERNELS_CONCAT_OP_H_
+
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/device_base.h"
+
+namespace tensorflow {
+
+// Assumes all inputs are nonempty
+template <typename T>
+void ConcatCPU(DeviceBase* d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output);
+
+// Assumes all inputs are nonempty
+template <typename T>
+void ConcatGPU(const Eigen::GpuDevice& d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output);
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONCAT_OP_H_
diff --git a/tensorflow/core/kernels/concat_op_cpu.cc b/tensorflow/core/kernels/concat_op_cpu.cc
new file mode 100644
index 0000000000..679a53721c
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op_cpu.cc
@@ -0,0 +1,122 @@
+#define EIGEN_USE_THREADS
+
+#include <string.h>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_op.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+template <typename T>
+static inline void Copy(T* dst, const T* src, int n) {
+ if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ memcpy(dst, src, n * sizeof(T));
+ } else {
+ for (int k = 0; k < n; ++k) {
+ *dst++ = *src++;
+ }
+ }
+}
+
+template <typename T>
+void ConcatCPU(DeviceBase* d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output) {
+ int num_inputs = inputs.size();
+ std::vector<ptrdiff_t> sizes;
+ sizes.reserve(num_inputs);
+ int row_size = 0;
+ for (int j = 0; j < num_inputs; ++j) {
+ sizes.push_back(inputs[j]->dimension(1));
+ row_size += sizes.back();
+ }
+
+ auto worker_threads = d->tensorflow_cpu_worker_threads();
+ int num_threads = std::min<int>(std::min(4, worker_threads->num_threads),
+ output->size() / 4096);
+ // Single threaded mode.
+ if (num_threads == 0) {
+ T* out = &(*output)(0, 0);
+ std::vector<const T*> inp;
+ inp.reserve(num_inputs);
+ for (int j = 0; j < num_inputs; ++j) {
+ inp.push_back(&(*inputs[j])(0, 0));
+ }
+ const int dim0 = output->dimension(0);
+ for (int i = 0; i < dim0; ++i) {
+ for (int j = 0; j < num_inputs; ++j) {
+ auto size = sizes[j];
+ Copy(out, inp[j], size);
+ out += size;
+ inp[j] += size;
+ }
+ }
+ return;
+ }
+
+ // Sharded mode.
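+  // Each shard receives a half-open range [start, end) of flat output
+  // elements; the range may start or end in the middle of an output row, so
+  // a leading partial row is handled separately below.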
+ auto work = [&row_size, &sizes, &inputs, &output, &num_inputs](int64 start,
+ int64 end) {
+ int64 skipped_rows = start / row_size;
+ T* out = output->data() + skipped_rows * row_size;
+ T* out_start = output->data() + start;
+ T* out_end = output->data() + end;
+
+ // Handle partial row at start
+ if (out < out_start) {
+ for (int j = 0; j < num_inputs; ++j) {
+ ptrdiff_t size = sizes[j];
+ ptrdiff_t offset = out_start - out;
+ if (size <= offset) {
+ out += size;
+ continue;
+ }
+ const T* inp = &(*inputs[j])(skipped_rows, 0);
+ if (offset > 0) {
+ out += offset;
+ inp += offset;
+ size -= offset;
+ }
+ size = std::min(size, out_end - out);
+ if (size <= 0) break;
+ Copy(out, inp, size);
+ out += size;
+ }
+ ++skipped_rows;
+ }
+ if (out == out_end) return;
+ CHECK(out >= out_start);
+ CHECK(out < out_end);
+
+ // Copy remaining data.
+ std::vector<const T*> inp;
+ inp.reserve(num_inputs);
+ for (int j = 0; j < num_inputs; ++j) {
+ inp.push_back(&(*inputs[j])(skipped_rows, 0));
+ }
+ const int dim0 = output->dimension(0);
+ for (int i = skipped_rows; i < dim0; ++i) {
+ for (int j = 0; j < num_inputs; ++j) {
+ ptrdiff_t size = std::min(sizes[j], out_end - out);
+ Copy(out, inp[j], size);
+ out += size;
+ inp[j] += size;
+ if (out == out_end) return;
+ }
+ }
+ };
+ Shard(num_threads, worker_threads->workers, output->size(), 100, work);
+}
+
+#define REGISTER(T) \
+ template void ConcatCPU<T>( \
+ DeviceBase*, \
+ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+ typename TTypes<T, 2>::Matrix* output);
+TF_CALL_ALL_TYPES(REGISTER)
+REGISTER(quint8)
+REGISTER(qint8)
+REGISTER(qint32)
+REGISTER(bfloat16)
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op_gpu.cu.cc b/tensorflow/core/kernels/concat_op_gpu.cu.cc
new file mode 100644
index 0000000000..d8ce6bd85d
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op_gpu.cu.cc
@@ -0,0 +1,41 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include <memory>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename T>
+void ConcatGPU(const GPUDevice& d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output) {
+ Eigen::array<ptrdiff_t, 2> offset(0, 0);
+ for (int i = 0; i < inputs.size(); ++i) {
+ Eigen::array<ptrdiff_t, 2> size = inputs[i]->dimensions();
+ output->slice(offset, size).device(d) = *inputs[i];
+ offset[1] += size[1];
+ }
+}
+
+#define REGISTER_GPU(T) \
+ template void ConcatGPU<T>( \
+ const GPUDevice& d, \
+ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \
+ inputs, \
+ typename TTypes<T, 2>::Matrix* output);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
new file mode 100644
index 0000000000..4ccc5b5b19
--- /dev/null
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -0,0 +1,240 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim2'
+// in size, and concatenate them along 'concat_dimension'.
+template <typename T>
+static void ConcatHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ DataType dt = DataTypeToEnum<T>::v();
+ const int kDim1 = 100;
+ Tensor concat_dim(DT_INT32, TensorShape({}));
+ concat_dim.scalar<int32>()() = concat_dimension;
+ Tensor in0(dt, TensorShape({kDim1, dim2}));
+ in0.flat<T>().setRandom();
+ Tensor in1(dt, TensorShape({kDim1, dim2}));
+ in1.flat<T>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(
+ NodeBuilder(g->NewName("n"), "Concat")
+ .Input(test::graph::Constant(g, concat_dim))
+ .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)})
+ .Attr("N", 2)
+ .Attr("T", dt)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) *
+ ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
+static void BM_ConcatDim0Float(int iters, int dim2) {
+ ConcatHelper<float>(iters, 0, dim2);
+}
+
+static void BM_ConcatDim1Float(int iters, int dim2) {
+ ConcatHelper<float>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
+
+static void BM_ConcatDim1int16(int iters, int dim2) {
+ ConcatHelper<int16>(iters, 1, dim2);
+}
+static void BM_ConcatDim1bfloat16(int iters, int dim2) {
+ ConcatHelper<bfloat16>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
+
+template <typename T>
+static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ DataType dt = DataTypeToEnum<T>::v();
+ const int kDim1 = 40000;
+ const int kNumInputs = 64;
+ Tensor concat_dim(DT_INT32, TensorShape({}));
+ concat_dim.scalar<int32>()() = concat_dimension;
+ std::vector<NodeBuilder::NodeOut> inputs;
+ inputs.reserve(kNumInputs);
+ for (int i = 0; i < kNumInputs; ++i) {
+ Tensor in(dt, TensorShape({kDim1, dim2}));
+ in.flat<T>().setRandom();
+ inputs.push_back(test::graph::Constant(g, in));
+ }
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat")
+ .Input(test::graph::Constant(g, concat_dim))
+ .Input(inputs)
+ .Attr("N", 64)
+ .Attr("T", dt)
+ .Finalize(g, &node));
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumInputs * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
+static void BM_ConcatManyDim1bfloat16(int iters, int dim2) {
+ ConcatManyHelper<bfloat16>(iters, 1, dim2);
+}
+
+BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60);
+
+static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 100;
+ std::vector<float> data1(kDim1 * dim2, 1.0f);
+ std::vector<float> data2(kDim1 * dim2, 2.0f);
+
+ testing::BytesProcessed(static_cast<int64>(iters) *
+ ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
+ testing::StartTiming();
+ while (--iters > 0) {
+ const int n0 = data1.size();
+ const int n1 = data2.size();
+ float* result = new float[n0 + n1];
+ memcpy(&result[0], &data1[0], n0 * sizeof(float));
+ memcpy(&result[n0], &data2[0], n1 * sizeof(float));
+ delete[] result;
+ }
+}
+
+static void BM_MemcpyAlternativeDim0(int iters, int dim2) {
+ MemcpyAlternativeHelper(iters, 0, dim2);
+}
+static void BM_MemcpyAlternativeDim1(int iters, int dim2) {
+ MemcpyAlternativeHelper(iters, 1, dim2);
+}
+
+BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
+BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
+
+typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> EigenMap;
+static void MemcpyManyAlternative1(int iters, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 40000;
+ const int kNumCopies = 64;
+ const int size = kDim1 * dim2 * kNumCopies;
+ bfloat16* data = new bfloat16[size];
+ EigenMap map(data, size);
+ map.setRandom();
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumCopies * sizeof(bfloat16));
+ testing::StartTiming();
+ while (iters-- > 0) {
+ std::vector<bfloat16*> inputs(kNumCopies);
+ for (int i = 0; i < kNumCopies; ++i) {
+ inputs[i] = &data[i * kDim1 * dim2];
+ }
+ bfloat16* result = new bfloat16[size];
+ for (int j = 0; j < kNumCopies; ++j) {
+ bfloat16* output = &result[j * dim2];
+ for (int i = 0; i < kDim1; ++i) {
+ if (i + 1 < kDim1) {
+ port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2);
+ }
+ memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
+ inputs[j] += dim2;
+ output += dim2 * kNumCopies;
+ }
+ }
+ delete[] result;
+ }
+ delete[] data;
+}
+
+static void MemcpyManyAlternative2(int iters, int dim2) {
+ testing::StopTiming();
+
+ const int kDim1 = 40000;
+ const int kNumCopies = 64;
+ const int size = kDim1 * dim2 * kNumCopies;
+ bfloat16* data = new bfloat16[size];
+ EigenMap map(data, size);
+ map.setRandom();
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
+ kNumCopies * sizeof(bfloat16));
+ testing::StartTiming();
+ std::vector<bfloat16*> inputs(kNumCopies);
+ while (--iters > 0) {
+ bfloat16* result = new bfloat16[size];
+ for (int i = 0; i < kNumCopies; ++i) {
+ inputs[i] = &data[i * kDim1 * dim2];
+ }
+ bfloat16* output = result;
+ for (int i = 0; i < kDim1; ++i) {
+ for (int j = 0; j < kNumCopies; ++j) {
+ if (j + 1 < kNumCopies) {
+ port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]);
+ }
+ memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
+ inputs[j] += dim2;
+ output += dim2;
+ }
+ }
+ delete[] result;
+ }
+ delete[] data;
+}
+
+BENCHMARK(MemcpyManyAlternative1)
+ ->Arg(16)
+ ->Arg(17)
+ ->Arg(18)
+ ->Arg(32)
+ ->Arg(33)
+ ->Arg(34)
+ ->Arg(60)
+ ->Arg(64)
+ ->Arg(65);
+
+BENCHMARK(MemcpyManyAlternative2)
+ ->Arg(16)
+ ->Arg(17)
+ ->Arg(18)
+ ->Arg(32)
+ ->Arg(33)
+ ->Arg(34)
+ ->Arg(60)
+ ->Arg(64)
+ ->Arg(65);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
new file mode 100644
index 0000000000..281bafd3df
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -0,0 +1,249 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/constant_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+ConstantOp::ConstantOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+ const TensorProto* proto = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
+ OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
+ *proto, AllocatorAttributes(), &tensor_));
+ OP_REQUIRES(
+ ctx, ctx->output_type(0) == tensor_.dtype(),
+ errors::InvalidArgument("Type mismatch between value (",
+ DataTypeString(tensor_.dtype()), ") and dtype (",
+ DataTypeString(ctx->output_type(0)), ")"));
+}
+
+void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); }
+
+ConstantOp::~ConstantOp() {}
+
+REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
+
+#if GOOGLE_CUDA
+#define REGISTER_KERNEL(D, TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Const").Device(DEVICE_##D).TypeConstraint<TYPE>("dtype"), \
+ ConstantOp);
+REGISTER_KERNEL(GPU, float);
+REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, uint8);
+REGISTER_KERNEL(GPU, int8);
+REGISTER_KERNEL(GPU, int16);
+REGISTER_KERNEL(GPU, int64);
+REGISTER_KERNEL(GPU, complex64);
+REGISTER_KERNEL(GPU, bool);
+// Currently we do not support string constants on GPU
+#undef REGISTER_KERNEL
+#endif
+
+// HostConstantOp differs from ConstantOp in that its output is always
+// in host memory.
+class HostConstantOp : public OpKernel {
+ public:
+ explicit HostConstantOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+ const TensorProto* proto = nullptr;
+ AllocatorAttributes alloc_attr;
+ alloc_attr.set_on_host(true);
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
+ OP_REQUIRES_OK(
+ ctx, ctx->device()->MakeTensorFromProto(*proto, alloc_attr, &tensor_));
+ OP_REQUIRES(
+ ctx, ctx->output_type(0) == tensor_.dtype(),
+ errors::InvalidArgument(
+ "Type mismatch between value (", DataTypeString(tensor_.dtype()),
+ ") and dtype (", DataTypeString(ctx->output_type(0)), ")"));
+ }
+
+ void Compute(OpKernelContext* ctx) override { ctx->set_output(0, tensor_); }
+
+ bool IsExpensive() override { return false; }
+
+ ~HostConstantOp() override {}
+
+ private:
+ Tensor tensor_;
+ TF_DISALLOW_COPY_AND_ASSIGN(HostConstantOp);
+};
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Const")
+ .Device(DEVICE_GPU)
+ .HostMemory("output")
+ .TypeConstraint<int32>("dtype"),
+ HostConstantOp);
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+// Partial specialization of FillFunctor<Device=CPUDevice, T>.
+template <typename T>
+struct FillFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in) {
+ out.device(d) = out.constant(in());
+ }
+};
+
+// Partial specialization of SetZeroFunctor<Device=CPUDevice, T>.
+template <typename T>
+struct SetZeroFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out) {
+ out.device(d) = out.constant(0);
+ }
+};
+
+#define DEFINE_SETZERO_CPU(T) template struct SetZeroFunctor<CPUDevice, T>
+DEFINE_SETZERO_CPU(float);
+DEFINE_SETZERO_CPU(double);
+DEFINE_SETZERO_CPU(int32);
+DEFINE_SETZERO_CPU(complex64);
+#undef DEFINE_SETZERO_CPU
+
+} // end namespace functor
+
+template <typename Device, typename T>
+class FillOp : public OpKernel {
+ public:
+ explicit FillOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& Tdims = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyVector(Tdims.shape()),
+ errors::InvalidArgument("dims must be a vector of int32."));
+ const Tensor& Tvalue = context->input(1);
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(Tvalue.shape()),
+ errors::InvalidArgument("value must be a scalar."));
+ auto dims = Tdims.flat<int32>();
+ for (int i = 0; i < dims.size(); i++) {
+ OP_REQUIRES(context, dims(i) >= 0,
+ errors::InvalidArgument("dims[", i, "] = ", dims(i),
+ " must be nonnegative."));
+ }
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_output(
+ 0, TensorShapeUtils::MakeShape(
+ reinterpret_cast<const int32*>(dims.data()), dims.size()),
+ &out));
+ functor::FillFunctor<Device, T> functor;
+ functor(context->eigen_device<Device>(), out->flat<T>(),
+ Tvalue.scalar<T>());
+ }
+};
+
+#define REGISTER_KERNEL(D, TYPE) \
+ REGISTER_KERNEL_BUILDER(Name("Fill") \
+ .Device(DEVICE_##D) \
+ .TypeConstraint<TYPE>("T") \
+ .HostMemory("dims"), \
+ FillOp<D##Device, TYPE>);
+
+#define REGISTER_CPU_KERNEL(TYPE) REGISTER_KERNEL(CPU, TYPE)
+TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL(GPU, float);
+REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, uint8);
+REGISTER_KERNEL(GPU, int8);
+REGISTER_KERNEL(GPU, int16);
+REGISTER_KERNEL(GPU, int64);
+// Currently we do not support filling strings and complex64 on GPU
+
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Fill")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("dims")
+ .HostMemory("value")
+ .HostMemory("output"),
+ FillOp<CPUDevice, int32>);
+
+template <typename Device, typename T>
+class ZerosLikeOp : public OpKernel {
+ public:
+ explicit ZerosLikeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& input = ctx->input(0);
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &out));
+ Tensor zero(DataTypeToEnum<T>::value, {1});
+ zero.scalar<T>().setZero();
+ const Tensor& zero_cref = zero;
+ functor::FillFunctor<Device, T> functor;
+ functor(ctx->eigen_device<Device>(), out->flat<T>(), zero_cref.scalar<T>());
+ }
+};
+
+#define REGISTER_KERNEL(type, dev) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ZerosLike").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
+ ZerosLikeOp<dev##Device, type>)
+
+#define REGISTER_CPU(type) REGISTER_KERNEL(type, CPU)
+TF_CALL_ALL_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL(float, GPU);
+REGISTER_KERNEL(double, GPU);
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_KERNEL
+
+class PlaceholderOp : public OpKernel {
+ public:
+ explicit PlaceholderOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &expected_shape_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (expected_shape_.dims() > 0) {
+ OP_REQUIRES(ctx, false,
+ errors::InvalidArgument(
+ "You must feed a value for placeholder tensor '", name(),
+ "' with dtype ", DataTypeString(output_type(0)),
+ " and shape ", expected_shape_.DebugString()));
+ } else {
+ OP_REQUIRES(ctx, false,
+ errors::InvalidArgument(
+ "You must feed a value for placeholder tensor '", name(),
+ "' with dtype ", DataTypeString(output_type(0))));
+ }
+ }
+
+ private:
+ TensorShape expected_shape_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE_CPU), PlaceholderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.h b/tensorflow/core/kernels/constant_op.h
new file mode 100644
index 0000000000..20a5c9c42f
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op.h
@@ -0,0 +1,25 @@
+#ifndef TENSORFLOW_KERNELS_CONSTANT_OP_H_
+#define TENSORFLOW_KERNELS_CONSTANT_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// ConstantOp returns the tensor specified by its "value" attribute.
+class ConstantOp : public OpKernel {
+ public:
+ explicit ConstantOp(OpKernelConstruction* ctx);
+ void Compute(OpKernelContext* ctx) override;
+ bool IsExpensive() override { return false; }
+ ~ConstantOp() override;
+
+ private:
+ Tensor tensor_;
+ TF_DISALLOW_COPY_AND_ASSIGN(ConstantOp);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONSTANT_OP_H_
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
new file mode 100644
index 0000000000..64502378bd
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc
@@ -0,0 +1,89 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace Eigen {
+namespace internal {
+
+template <typename T>
+struct scalar_const_op {
+ typedef typename packet_traits<T>::type Packet;
+
+ const T* val;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ scalar_const_op(const scalar_const_op& x)
+ : val(x.val) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {}
+
+ template <typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(Index,
+ Index = 0) const {
+ return *val;
+ }
+
+ template <typename Index>
+ EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const {
+ return internal::pset1<Packet>(*val);
+ }
+};
+
+template <typename T>
+struct functor_traits<scalar_const_op<T> > {
+ enum {
+ Cost = 1,
+ PacketAccess = packet_traits<T>::Vectorizable,
+ IsRepeatable = true
+ };
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+namespace tensorflow {
+
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization of FillFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct FillFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in) {
+ Eigen::internal::scalar_const_op<T> f(in.data());
+ out.device(d) = out.nullaryExpr(f);
+ }
+};
+
+#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>
+DEFINE_FILL_GPU(float);
+DEFINE_FILL_GPU(double);
+DEFINE_FILL_GPU(int32);
+DEFINE_FILL_GPU(uint8);
+DEFINE_FILL_GPU(int16);
+DEFINE_FILL_GPU(int8);
+DEFINE_FILL_GPU(int64);
+#undef DEFINE_FILL_GPU
+
+// Partial specialization of SetZeroFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct SetZeroFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
+ out.device(d) = out.constant(0);
+ }
+};
+
+#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>
+DEFINE_SETZERO_GPU(float);
+#undef DEFINE_SETZERO_GPU
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
new file mode 100644
index 0000000000..f5a464c07c
--- /dev/null
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -0,0 +1,43 @@
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Returns graph containing "num" const nodes. If 'sequential' is
+// true, make sure all constants are executed sequentially in the
+// graph by adding control dependencies.
+static Graph* ManyConsts(int num, bool sequential) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Node* prev = nullptr;
+ for (int i = 0; i < num; ++i) {
+ Tensor c(DT_FLOAT, TensorShape({}));
+ c.scalar<float>()() = i;
+ Node* curr = test::graph::Constant(g, c);
+ if (sequential && prev != nullptr) {
+ g->AddControlEdge(prev, curr);
+ }
+ prev = curr;
+ }
+ return g;
+}
+
+static void BM_ManyConsts_Parallel(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ test::Benchmark("cpu", ManyConsts(num, false /* !sequential */)).Run(iters);
+}
+BENCHMARK(BM_ManyConsts_Parallel)->Range(1, 1 << 10);
+
+static void BM_ManyConsts_Sequential(int iters, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ test::Benchmark("cpu", ManyConsts(num, true /* sequential */)).Run(iters);
+}
+BENCHMARK(BM_ManyConsts_Sequential)->Range(1, 1 << 10);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
new file mode 100644
index 0000000000..bc44a7f7cc
--- /dev/null
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -0,0 +1,359 @@
+#include "tensorflow/core/kernels/control_flow_ops.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+// A switch op has two inputs and two outputs. It forwards the value of
+// input:0 to the output port selected by input:1, which must be a boolean
+// scalar. Input:0 is forwarded to output:0 if input:1 is false, and to
+// output:1 otherwise.
+class SwitchOp : public OpKernel {
+ public:
+ explicit SwitchOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& outputPorts = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsScalar(outputPorts.shape()),
+ errors::InvalidArgument("The second input must be a scalar, "
+ "but it has shape ",
+ outputPorts.shape().ShortDebugString()));
+
+ bool pred = outputPorts.scalar<bool>()();
+ int port = (pred) ? 1 : 0;
+ if (IsRefType(context->input_dtype(0))) {
+ context->forward_ref_input_to_ref_output(0, port);
+ } else {
+ context->set_output(port, context->input(0));
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~SwitchOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(SwitchOp);
+};
+
+#define REGISTER_CPU_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("Switch") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+#define REGISTER_CPU_REF_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+#define REGISTER_GPU_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("Switch") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+#define REGISTER_GPU_REF_SWITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("pred") \
+ .TypeConstraint<type>("T"), \
+ SwitchOp)
+
+TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH);
+TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SWITCH);
+REGISTER_GPU_SWITCH(bool);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_REF_SWITCH);
+REGISTER_GPU_REF_SWITCH(int32);
+REGISTER_GPU_REF_SWITCH(bool);
+
+#undef REGISTER_CPU_SWITCH
+#undef REGISTER_CPU_REF_SWITCH
+#undef REGISTER_GPU_SWITCH
+#undef REGISTER_GPU_REF_SWITCH
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Switch")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("pred")
+ .HostMemory("output_false")
+ .HostMemory("output_true")
+ .TypeConstraint<int32>("T"),
+ SwitchOp);
+
+class RefSelectOp : public OpKernel {
+ public:
+ explicit RefSelectOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("N", &num_ref_inputs_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& index_tensor = context->input(0);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsScalar(index_tensor.shape()),
+ errors::InvalidArgument("Index must be a scalar, "
+ "but it has shape ",
+ index_tensor.shape().ShortDebugString()));
+
+ int32 index = index_tensor.scalar<int32>()();
+
+ OP_REQUIRES(context, index >= 0 && index < num_ref_inputs_,
+ errors::InvalidArgument("Index must be in the range [0, ",
+ num_ref_inputs_, ") but got ", index));
+ context->forward_ref_input_to_ref_output(index + 1, 0);
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~RefSelectOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RefSelectOp);
+
+ private:
+ int num_ref_inputs_;
+};
+
+#define REGISTER_CPU_REF_SELECT(type) \
+ REGISTER_KERNEL_BUILDER(Name("RefSelect") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("index") \
+ .TypeConstraint<type>("T"), \
+ RefSelectOp)
+TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SELECT);
+
+#undef REGISTER_CPU_REF_SELECT
+
+// A merge op has n inputs and two outputs. It forwards the value of the
+// first input that becomes available to its first output, and the
+// index of the first input to its second output.
+class MergeOp : public OpKernel {
+ public:
+ explicit MergeOp(OpKernelConstruction* context) : OpKernel(context) {
+ const DataType dt = context->input_type(0);
+ const int num_in = context->num_inputs();
+ OP_REQUIRES_OK(context, context->MatchSignature(DataTypeVector(num_in, dt),
+ {dt, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ bool input_seen = false;
+ for (int i = 0; i < context->num_inputs(); ++i) {
+ if (context->has_input(i)) {
+ if (input_seen) {
+ context->SetStatus(errors::Internal(
+ "Merge can not have more than one valid input."));
+ return;
+ }
+ input_seen = true;
+
+ context->set_output(0, context->input(i));
+ Tensor* value_index = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({}),
+ &value_index));
+ value_index->scalar<int32>()() = i;
+ }
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~MergeOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(MergeOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Merge").Device(DEVICE_CPU), MergeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Merge") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("value_index"), \
+ MergeOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Merge")
+ .Device(DEVICE_GPU)
+ .HostMemory("inputs")
+ .HostMemory("output")
+ .HostMemory("value_index")
+ .TypeConstraint<int32>("T"),
+ MergeOp);
+
+// An enter op has one input and one output. It creates or finds
+// the child frame that is uniquely identified by the frame_name,
+// and makes its input available to the child frame.
+class EnterOp : public OpKernel {
+ public:
+ explicit EnterOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ if (IsRefType(context->input_dtype(0))) {
+ context->forward_ref_input_to_ref_output(0, 0);
+ } else {
+ context->set_output(0, context->input(0));
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~EnterOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(EnterOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE_CPU), EnterOp);
+REGISTER_KERNEL_BUILDER(Name("RefEnter").Device(DEVICE_CPU), EnterOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Enter").Device(DEVICE_GPU).TypeConstraint<type>("T"), EnterOp);
+#define REGISTER_GPU_REF_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RefEnter").Device(DEVICE_GPU).TypeConstraint<type>("T"), EnterOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_NUMBER_TYPES(REGISTER_GPU_REF_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+#undef REGISTER_GPU_REF_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Enter")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ EnterOp);
+
+// An exit op has one input and one output. It exits the current
+// frame to its parent frame, and makes its input available to the
+// parent frame.
+class ExitOp : public OpKernel {
+ public:
+ explicit ExitOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ context->set_output(0, context->input(0));
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~ExitOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(ExitOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE_CPU), ExitOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Exit").Device(DEVICE_GPU).TypeConstraint<type>("T"), ExitOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Exit")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ ExitOp);
+
+// A next_iteration op has one input and one output. It makes its input
+// available to the next iteration.
+class NextIterationOp : public OpKernel {
+ public:
+ explicit NextIterationOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ context->set_output(0, context->input(0));
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~NextIterationOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE_CPU),
+ NextIterationOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("NextIteration").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ NextIterationOp);
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("NextIteration")
+ .Device(DEVICE_GPU)
+ .HostMemory("data")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ NextIterationOp);
+
+// A LoopCond op has one input and one output. The input is a boolean
+// scalar representing the taken branches of the "pivot" Switch that
+// determines loop termination. As a contract, any high-level front-end
+// should always use port '0' of the "pivot" switches for loop exit.
+class LoopCondOp : public OpKernel {
+ public:
+ explicit LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ context->set_output(0, context->input(0));
+ }
+
+ bool IsExpensive() override { return false; }
+
+ ~LoopCondOp() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("LoopCond").Device(DEVICE_CPU), LoopCondOp);
+REGISTER_KERNEL_BUILDER(Name("LoopCond")
+ .Device(DEVICE_GPU)
+ .HostMemory("input")
+ .HostMemory("output"),
+ LoopCondOp);
+
+// ControlTrigger kernels
+REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
+ ControlTriggerOp);
+
+REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
+ ControlTriggerOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h
new file mode 100644
index 0000000000..184cc9fb63
--- /dev/null
+++ b/tensorflow/core/kernels/control_flow_ops.h
@@ -0,0 +1,22 @@
+#ifndef TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
+#define TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+// A ControlTriggerOp is similar to a NoOp. However, it always treats the input
+// control edges as Live edges. Its primary use so far is in the scheduling of
+// recvs, where we add ControlTrigger nodes and use them to trigger recvs. We
+// allow ControlTrigger nodes to be enabled by dead nodes.
+class ControlTriggerOp : public OpKernel {
+ public:
+ explicit ControlTriggerOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+ void Compute(OpKernelContext* context) override {}
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc
new file mode 100644
index 0000000000..52bc11abf0
--- /dev/null
+++ b/tensorflow/core/kernels/control_flow_ops_test.cc
@@ -0,0 +1,71 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+// Tests for the switch op
+class SwitchOpTest : public OpsTestBase {
+ protected:
+ void Initialize(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("op", "Switch")
+ .Input(FakeInput(dt))
+ .Input(FakeInput())
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SwitchOpTest, Int32Success_6_s0) {
+ Initialize(DT_INT32);
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<bool>(TensorShape({}), {false});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+ EXPECT_EQ(nullptr, GetOutput(1));
+}
+
+TEST_F(SwitchOpTest, Int32Success_6_s1) {
+ Initialize(DT_INT32);
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<bool>(TensorShape({}), {true});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(1));
+ EXPECT_EQ(nullptr, GetOutput(0));
+}
+
+TEST_F(SwitchOpTest, Int32Success_2_3_s0) {
+ Initialize(DT_INT32);
+ AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<bool>(TensorShape({}), {false});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({2, 3}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+ EXPECT_EQ(nullptr, GetOutput(1));
+}
+
+TEST_F(SwitchOpTest, StringSuccess_s1) {
+ Initialize(DT_STRING);
+ AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"});
+ AddInputFromArray<bool>(TensorShape({}), {true});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_STRING, TensorShape({6}));
+ test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(1));
+ EXPECT_EQ(nullptr, GetOutput(0));
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
new file mode 100644
index 0000000000..2fb623244c
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -0,0 +1,127 @@
+#ifndef TENSORFLOW_KERNELS_CONV_2D_H_
+#define TENSORFLOW_KERNELS_CONV_2D_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// TODO(yangke): revisit these operations and in particular, see if we can
+// combine all of them into just one operation without causing nvcc to
+// timeout.
+template <typename Device, typename T, int Dims>
+struct ShuffleAndReverse {
+ void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ const Eigen::DSizes<Eigen::DenseIndex, Dims>& order,
+ const Eigen::array<bool, Dims>& reverse_dims,
+ typename TTypes<T, Dims>::Tensor output) {
+ output.device(d) = input.shuffle(order).reverse(reverse_dims);
+ }
+};
+
+template <typename Device, typename T, int Dims>
+struct InflatePadAndShuffle {
+ void operator()(
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ const Eigen::DSizes<Eigen::DenseIndex, Dims>& strides,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, Dims>& pad_dims,
+ const Eigen::DSizes<Eigen::DenseIndex, Dims>& order,
+ typename TTypes<T, Dims>::Tensor output) {
+ output.device(d) = input.inflate(strides).pad(pad_dims).shuffle(order);
+ }
+};
+
+template <typename Device, typename Input, typename Filter, typename Output>
+void SpatialConvolutionFunc(const Device& d, Output output, Input input,
+ Filter filter, int stride,
+ const Eigen::PaddingType& padding) {
+ output.device(d) = Eigen::SpatialConvolution(input, filter, stride, padding);
+}
+
+template <typename Device, typename T>
+struct SpatialConvolution {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T, 4>::ConstTensor filter, int stride,
+ const Eigen::PaddingType& padding) {
+ SpatialConvolutionFunc(d, output, input, filter, stride, padding);
+ }
+};
+
+template <typename Device, typename T>
+struct SpatialConvolutionBackwardInput {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
+ typename TTypes<T, 4>::ConstTensor kernel,
+ typename TTypes<T, 4>::ConstTensor output_backward,
+ int input_rows, int input_cols, int stride) {
+ input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput(
+ kernel, output_backward, input_rows, input_cols, stride);
+ }
+};
+
+template <typename Device, typename T>
+struct SpatialConvolutionBackwardKernel {
+ void operator()(const Device& d,
+ typename TTypes<T, 4>::Tensor kernel_backward,
+ typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T, 4>::ConstTensor output_backward,
+ int kernel_rows, int kernel_cols, int stride) {
+ kernel_backward.device(d) = Eigen::SpatialConvolutionBackwardKernel(
+ input, output_backward, kernel_rows, kernel_cols, stride);
+ }
+};
+
+// TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h.
+// My initial attempt to do this compiled but failed in the pytest
+// due to a swigdeps error.
+template <typename Device, typename T>
+struct MatMulConvFunctor {
+ // Computes on device "d": out = in0 * in1, where * is matrix
+ // multiplication.
+ void operator()(
+ const Device& d, typename TTypes<T, 2>::Tensor out,
+ typename TTypes<T, 2>::ConstTensor in0,
+ typename TTypes<T, 2>::ConstTensor in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ out.device(d) = in0.contract(in1, dim_pair);
+ }
+};
+
+template <typename Device, typename T>
+struct TransformFilter {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
+ typename TTypes<T, 4>::Tensor out) {
+ out.device(d) = in.shuffle(Eigen::DSizes<Eigen::DenseIndex, 4>(3, 2, 0, 1));
+ }
+};
+
+template <typename Device, typename T>
+struct TransformDepth {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,
+ typename TTypes<T, 4>::Tensor out) {
+ out.device(d) = in.shuffle(shuffle);
+ }
+};
+
+template <typename Device, typename T>
+struct PadInput {
+ void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
+ int padding_rows_left, int padding_rows_right,
+ int padding_cols_left, int padding_cols_right,
+ typename TTypes<T, 4>::Tensor out) {
+ Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 4> padding;
+ padding[0] = std::make_pair(0, 0);
+ padding[1] = std::make_pair(padding_rows_left, padding_rows_right);
+ padding[2] = std::make_pair(padding_cols_left, padding_cols_right);
+ padding[3] = std::make_pair(0, 0);
+ out.device(d) = in.pad(padding);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CONV_2D_H_
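A minimal sketch of the filter-layout permutation used above: TransformFilter shuffles a
filter from (rows, cols, in_depth, out_depth) to (out_depth, in_depth, rows, cols) via
shuffle(3, 2, 0, 1), and TransformDepth applies a caller-supplied permutation in the same
way. The standalone check below assumes only Eigen's unsupported Tensor module (inside the
TensorFlow tree the include path is third_party/eigen3/unsupported/Eigen/CXX11/Tensor);
the tensor sizes are arbitrary illustration values, not anything from the diff.

// Minimal sketch of Eigen's shuffle semantics as used by TransformFilter:
// output dimension d takes its extent from input dimension order[d].
#include <cassert>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 4, Eigen::RowMajor> filter(5, 7, 3, 16);  // R, C, ID, OD
  filter.setRandom();
  const Eigen::array<int, 4> order{{3, 2, 0, 1}};
  Eigen::Tensor<float, 4, Eigen::RowMajor> out = filter.shuffle(order);
  assert(out.dimension(0) == 16);  // out_depth
  assert(out.dimension(1) == 3);   // in_depth
  assert(out.dimension(2) == 5);   // rows
  assert(out.dimension(3) == 7);   // cols
  // Element-wise, out(o, i, r, c) == filter(r, c, i, o).
  assert(out(11, 2, 4, 6) == filter(4, 6, 2, 11));
  return 0;
}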
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
new file mode 100644
index 0000000000..bb21d7003c
--- /dev/null
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -0,0 +1,1190 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define USE_EIGEN_TENSOR
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// The operation to compute Conv2D gradients.
+//
+// To compute the gradients for Conv2D we need three input tensors: the input,
+// the filter, and the backprop of the output. From these we compute two
+// backprops: one with respect to the input and one with respect to the
+// filter. They are computed in two separate kernels.
+
+// Both backprops can be computed as straightforward conv2d.
+//
+// Consider a case where the input is 3x3 and the filter is 2x1:
+//
+// INPUT = [ A B C ]
+// [ D E F ]
+// [ G H I ]
+//
+// where each "A", "B", etc is batch x in_depth
+//
+// FILTER = [ X Y ]
+//
+// where both "X" and "Y" are in_depth x out_depth
+//
+// With VALID padding, the output is 3x2:
+//
+// OUTPUT = [ a b ]
+// [ c d ]
+// [ e f ]
+//
+// where each "a", "b", etc is batch x out_depth
+//
+// So we have:
+//
+// a = A * X + B * Y
+// b = B * X + C * Y
+// c = D * X + E * Y
+// d = E * X + F * Y
+// e = G * X + H * Y
+// f = H * X + I * Y
+//
+// So when we have backprops for the outputs (we denote them by
+// a', b', ... ):
+//
+// The backprops for the input are:
+//
+// A' = a' * X^t
+// B' = a' * Y^t + b' * X^t
+// C' = b' * Y^t
+// ...
+//
+// This is essentially computing a 2d conv of
+//
+// INPUT = [ 0 a' b' 0 ]
+// [ 0 c' d' 0 ]
+// [ 0 e' f' 0 ]
+// and
+//
+// FILTER = [ Y^t X^t ]
+//
+// The backprops for the filter are:
+//
+// X' = A^t * a' + B^t * b' + D^t * c' + E^t * d' + G^t * e' + H^t * f'
+// Y' = B^t * a' + C^t * b' + E^t * c' + F^t * d' + H^t * e' + I^t * f'
+//
+// This is essentially computing a 2d conv of
+//
+// INPUT = [ A^t B^t C^t ]
+// [ D^t E^t F^t ]
+// [ G^t H^t I^t ]
+//
+// and
+//
+// FILTER = [ a' b' ]
+// [ c' d' ]
+// [ e' f' ]
+//
+//
+//////////////////////////////////////////////////////////
+//
+// With a stride greater than one, it's a bit more complicated (we will need
+// to insert holes into the backprop).
+//
+// Consider the case where
+//
+// INPUT = [ A B C D E ]
+// [ F G H I J ]
+// [ K L M N O ]
+// and
+//
+// FILTER = [ X Y Z ]
+//
+// with stride 2.
+//
+// The output will be
+//
+// OUTPUT = [ a b ]
+// [ c d ]
+//
+// where:
+//
+// a = A * X + B * Y + C * Z
+// b = C * X + D * Y + E * Z
+// c = K * X + L * Y + M * Z
+// d = M * X + N * Y + O * Z
+//
+//
+// To compute the backprop for INPUT, we need to convolve
+//
+// INPUT = [ 0 0 a' 0 b' 0 0 ]
+// [ 0 0 0 0 0 0 0 ]
+// [ 0 0 c' 0 d' 0 0 ]
+//
+// (notice the holes in INPUT)
+//
+// and
+//
+// FILTER = [ Z^t Y^t X^t ]
+//
+// with stride 1.
+//
+// To compute the backprop for FILTER, we need to convolve
+//
+// INPUT = [ A^t B^t C^t D^t E^t ]
+// [ F^t G^t H^t I^t J^t ]
+// [ K^t L^t M^t N^t O^t ]
+// and
+//
+// FILTER = [ a' 0 b' ]
+// [ 0 0 0 ]
+// [ c' 0 d' ]
+//
+// (notice the holes in FILTER)
+//
+//
+// with stride 1
+//
+//////////////////////////////////////////////////////////
+//
+//
+// The case for SAME padding is in fact very similar to VALID -- we just
+// need to pad the input tensor a bit when computing the filter_backprop.
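A minimal standalone check of the relations above, restricted to a single spatial row with
batch = in_depth = out_depth = 1 (so every transpose is trivial), using the VALID, stride-1
example of input [ A B C ] and filter [ X Y ]; the numeric values are arbitrary and chosen
only to make the check non-trivial.

// Minimal sketch: verify that the input backprop equals a VALID conv of the
// zero-padded output backprop with the reversed filter, and that the filter
// backprop equals a VALID conv of the input with the output backprop.
#include <array>
#include <cassert>

int main() {
  const std::array<float, 3> in = {1.f, 2.f, 3.f};    // [ A B C ]
  const std::array<float, 2> filt = {10.f, 100.f};    // [ X Y ]
  const std::array<float, 2> grad_out = {2.f, 5.f};   // [ a' b' ]

  // Direct gradients of a = A*X + B*Y and b = B*X + C*Y.
  const std::array<float, 3> din = {
      grad_out[0] * filt[0],                          // A' = a'*X
      grad_out[0] * filt[1] + grad_out[1] * filt[0],  // B' = a'*Y + b'*X
      grad_out[1] * filt[1]};                         // C' = b'*Y
  const std::array<float, 2> dfilt = {
      in[0] * grad_out[0] + in[1] * grad_out[1],      // X' = A*a' + B*b'
      in[1] * grad_out[0] + in[2] * grad_out[1]};     // Y' = B*a' + C*b'

  // Input backprop as a conv of [ 0 a' b' 0 ] with the reversed filter [ Y X ].
  const std::array<float, 4> padded = {0.f, grad_out[0], grad_out[1], 0.f};
  const std::array<float, 2> rev = {filt[1], filt[0]};
  for (int i = 0; i < 3; ++i) {
    assert(padded[i] * rev[0] + padded[i + 1] * rev[1] == din[i]);
  }
  // Filter backprop as a conv of the input with [ a' b' ].
  for (int i = 0; i < 2; ++i) {
    assert(in[i] * grad_out[0] + in[i + 1] * grad_out[1] == dfilt[i]);
  }
  return 0;
}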
+
+// Common code between the two kernels: verifies that the dimensions all match
+// and extracts the padded rows and columns.
+#define EXTRACT_AND_VERIFY_DIMENSIONS(label) \
+ const Tensor& out_backprop = context->input(2); \
+ OP_REQUIRES( \
+ context, input_shape.dims() == 4, \
+ errors::InvalidArgument(label, ": input must be 4-dimensional")); \
+ OP_REQUIRES( \
+ context, filter_shape.dims() == 4, \
+ errors::InvalidArgument(label, ": filter must be 4-dimensional")); \
+ OP_REQUIRES( \
+ context, out_backprop.dims() == 4, \
+ errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
+ const int64 batch = input_shape.dim_size(0); \
+ OP_REQUIRES( \
+ context, batch == out_backprop.dim_size(0), \
+ errors::InvalidArgument( \
+ label, ": input and out_backprop must have the same batch size")); \
+ const int64 input_rows = input_shape.dim_size(1); \
+ const int64 input_cols = input_shape.dim_size(2); \
+ const int64 filter_rows = filter_shape.dim_size(0); \
+ const int64 filter_cols = filter_shape.dim_size(1); \
+ const int64 output_rows = out_backprop.dim_size(1); \
+ const int64 output_cols = out_backprop.dim_size(2); \
+ const int64 in_depth = input_shape.dim_size(3); \
+ OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \
+ errors::InvalidArgument( \
+ label, ": input and filter must have the same depth")); \
+ const int64 out_depth = filter_shape.dim_size(3); \
+ OP_REQUIRES( \
+ context, out_depth == out_backprop.dim_size(3), \
+ errors::InvalidArgument( \
+ label, ": filter and out_backprop must have the same out_depth")); \
+ const auto stride = strides_[1]; \
+ int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; \
+ if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) { \
+ out_rows = input_rows; \
+ out_cols = input_cols; \
+ } else { \
+ OP_REQUIRES_OK( \
+ context, Get2dOutputSize(input_rows, input_cols, filter_rows, \
+ filter_cols, stride, stride, padding_, \
+ &out_rows, &out_cols, &pad_rows, &pad_cols)); \
+ } \
+ OP_REQUIRES( \
+ context, output_rows == out_rows, \
+ errors::InvalidArgument( \
+ label, ": Number of rows of out_backprop doesn't match computed: ", \
+ "actual = ", output_rows, ", computed = ", out_rows)); \
+ OP_REQUIRES( \
+ context, output_cols == out_cols, \
+ errors::InvalidArgument( \
+ label, ": Number of cols of out_backprop doesn't match computed: ", \
+ "actual = ", output_cols, ", computed = ", out_cols)); \
+ const auto expanded_out_rows = (output_rows - 1) * stride + 1; \
+ const auto expanded_out_cols = (output_cols - 1) * stride + 1; \
+ const auto padded_out_rows = input_rows + filter_rows - 1; \
+ const auto padded_out_cols = input_cols + filter_cols - 1; \
+ const auto top_pad_rows = filter_rows - 1 - pad_rows; \
+ const auto left_pad_cols = filter_cols - 1 - pad_cols; \
+ const auto bottom_pad_rows = \
+ padded_out_rows - expanded_out_rows - top_pad_rows; \
+ const auto right_pad_cols = \
+ padded_out_cols - expanded_out_cols - left_pad_cols; \
+ Eigen::DSizes<Eigen::DenseIndex, 4> strides{1, stride, stride, 1}; \
+ VLOG(2) << "Conv2d: " << label \
+ << ": expanded_out_rows = " << expanded_out_rows \
+ << ", expanded_out_cols = " << expanded_out_cols \
+ << ", filter_rows = " << filter_rows \
+ << ", filter_cols = " << filter_cols \
+ << ", padded_out_rows = " << padded_out_rows \
+ << ", padded_out_cols = " << padded_out_cols \
+ << ", top_pad_rows = " << top_pad_rows \
+ << ", left_pad_cols = " << left_pad_cols \
+ << ", bottom_pad_rows = " << bottom_pad_rows \
+ << ", right_pad_cols = " << right_pad_cols \
+ << ", strides = " << strides[1]
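As a worked instance of this bookkeeping, take the stride-2 example drawn above and assume
VALID padding (so Get2dOutputSize returns pad_cols = 0). The column-direction quantities
come out as:

  input_cols = 5, filter_cols = 3, stride = 2
  out_cols          = ceil((5 - 3 + 1) / 2)           = 2
  expanded_out_cols = (2 - 1) * 2 + 1                 = 3
  padded_out_cols   = 5 + 3 - 1                       = 7
  left_pad_cols     = 3 - 1 - 0                       = 2
  right_pad_cols    = 7 - 3 - 2                       = 2

which is exactly the seven-entry row [ 0 0 a' 0 b' 0 0 ] in the input-backprop diagram: the
output backprop gets stride - 1 zeros inserted between entries (the expansion) and two zeros
of padding on each side.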
+
+namespace {
+TensorShape VectorToShape(const TTypes<int32>::ConstVec& sizes) {
+ TensorShape shape;
+
+ using Index = TTypes<int32>::ConstVec::Index;
+ const Index dims = sizes.size();
+ for (Index i = 0; i < dims; ++i) {
+ shape.AddDim(sizes(i));
+ }
+
+ return shape;
+}
+} // namespace
+
+// The fast versions using eigen computations directly. They are only enabled
+// for CPU for now since nvcc times out when trying to compile them.
+// TODO(yangke): enable them for GPUs when we have a faster compiler.
+
+template <typename Device, class T>
+class Conv2DFastBackpropInputOp : public OpKernel {
+ public:
+ explicit Conv2DFastBackpropInputOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_sizes = context->input(0);
+ const Tensor& filter = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
+ input_sizes.dims()));
+ TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
+ const TensorShape& filter_shape = filter.shape();
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
+ Tensor* in_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_shape, &in_backprop));
+ // Need to flip the input_rows and input_cols when passing to eigen.
+ functor::SpatialConvolutionBackwardInput<Device, T>()(
+ context->eigen_device<Device>(), in_backprop->tensor<T, 4>(),
+ filter.tensor<T, 4>(), out_backprop.tensor<T, 4>(), input_cols,
+ input_rows, stride);
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropInputOp);
+};
+
+// Based on implementation written by Yangqing Jia (jiayq).
+template <typename Device, class T>
+class Conv2DCustomBackpropInputOp : public OpKernel {
+ public:
+ explicit Conv2DCustomBackpropInputOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(
+ context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_sizes = context->input(0);
+ const Tensor& filter = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
+ input_sizes.dims()));
+ TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
+ const TensorShape& filter_shape = filter.shape();
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
+ Tensor* in_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_shape, &in_backprop));
+
+ // TODO(andydavis) Consider moving code shared with
+ // Conv2DCustomBackpropFilterOp into a shared helper function.
+ int pad_top;
+ int pad_bottom;
+ int pad_left;
+ int pad_right;
+ OP_REQUIRES_OK(
+ context,
+ Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols,
+ stride, stride, padding_, &out_rows, &out_cols,
+ &pad_top, &pad_bottom, &pad_left, &pad_right));
+
+ // The total dimension size of each kernel.
+ const int filter_total_size = filter_rows * filter_cols * in_depth;
+ // The output image size is the spatial size of the output.
+ const int output_image_size = out_rows * out_cols;
+
+ Tensor col_buffer;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({output_image_size, filter_total_size}), &col_buffer));
+
+ // The input offset corresponding to a single input image.
+ const int input_offset = input_rows * input_cols * in_depth;
+ // The output offset corresponding to a single output image.
+ const int output_offset = out_rows * out_cols * out_depth;
+
+ auto* filter_data = filter.template flat<T>().data();
+ auto* col_buffer_data = col_buffer.template flat<T>().data();
+ auto* out_backprop_data = out_backprop.template flat<T>().data();
+ auto* input_backprop_data = in_backprop->template flat<T>().data();
+
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> MatrixMap;
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> ConstMatrixMap;
+
+ for (int image_id = 0; image_id < batch; ++image_id) {
+ // Compute gradient into col_buffer.
+ MatrixMap C(col_buffer_data, output_image_size, filter_total_size);
+
+ ConstMatrixMap A(out_backprop_data + output_offset * image_id,
+ output_image_size, out_depth);
+ ConstMatrixMap B(filter_data, filter_total_size, out_depth);
+
+ // TODO(andydavis) Use a multi-threaded matmul implementation here.
+ C.noalias() = A * B.transpose();
+
+ Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols, filter_rows,
+ filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
+ stride, input_backprop_data);
+
+ input_backprop_data += input_offset;
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
+};
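A sketch of the shape bookkeeping in the gemm above: per image, A is the out_backprop slice
of shape [output_image_size, out_depth] and B is the filter of shape
[filter_total_size, out_depth], so C = A * B^T has shape
[output_image_size, filter_total_size], one row of unfolded patch gradients per output
position; Col2im then accumulates those rows back into the
[input_rows, input_cols, in_depth] input backprop. The filter-gradient kernel further down
runs the same decomposition the other way around: Im2col unfolds the input into
[output_image_size, filter_total_size] and accumulates A^T * B, of shape
[filter_total_size, out_depth], into the filter backprop.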
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropInputOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_CPU)
+ .Label("custom")
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropInputOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_CPU)
+ .Label("eigen_tensor")
+ .TypeConstraint<float>("T"),
+ Conv2DFastBackpropInputOp<CPUDevice, float>);
+
+template <typename Device, class T>
+class Conv2DFastBackpropFilterOp : public OpKernel {
+ public:
+ explicit Conv2DFastBackpropFilterOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& filter_sizes = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(filter_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
+ filter_sizes.dims()));
+ const TensorShape& input_shape = input.shape();
+ TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter");
+ Tensor* filter_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, filter_shape, &filter_backprop));
+
+ // Need to flip the filter_rows and filter_cols when passing to eigen.
+ functor::SpatialConvolutionBackwardKernel<Device, T>()(
+ context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(),
+ input.tensor<T, 4>(), out_backprop.tensor<T, 4>(), filter_cols,
+ filter_rows, stride);
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropFilterOp);
+};
+
+// Based on implementation written by Yangqing Jia (jiayq).
+template <typename Device, class T>
+class Conv2DCustomBackpropFilterOp : public OpKernel {
+ public:
+ explicit Conv2DCustomBackpropFilterOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(
+ context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& filter_sizes = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(filter_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, "
+ "not ",
+ filter_sizes.dims()));
+ const TensorShape& input_shape = input.shape();
+ TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DCustomBackpropFilter");
+ Tensor* filter_backprop;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, filter_shape, &filter_backprop));
+
+ int pad_top;
+ int pad_bottom;
+ int pad_left;
+ int pad_right;
+ OP_REQUIRES_OK(
+ context,
+ Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols,
+ stride, stride, padding_, &out_rows, &out_cols,
+ &pad_top, &pad_bottom, &pad_left, &pad_right));
+
+ // The total dimension size of each kernel.
+ const int filter_total_size = filter_rows * filter_cols * in_depth;
+ // The output image size is the spatial size of the output.
+ const int output_image_size = out_rows * out_cols;
+
+ Tensor col_buffer;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({output_image_size, filter_total_size}), &col_buffer));
+
+ // The input offset corresponding to a single input image.
+ const int input_offset = input_rows * input_cols * in_depth;
+ // The output offset corresponding to a single output image.
+ const int output_offset = out_rows * out_cols * out_depth;
+
+ auto* input_data = input.template flat<T>().data();
+ auto* col_buffer_data = col_buffer.template flat<T>().data();
+ auto* out_backprop_data = out_backprop.template flat<T>().data();
+ auto* filter_backprop_data = filter_backprop->template flat<T>().data();
+
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> MatrixMap;
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> ConstMatrixMap;
+
+ MatrixMap C(filter_backprop_data, filter_total_size, out_depth);
+
+ C.setZero();
+ for (int image_id = 0; image_id < batch; ++image_id) {
+ // When we compute the gradient with respect to the filters, we need to do
+ // im2col to allow gemm-type computation.
+ Im2col<T>(input_data, in_depth, input_rows, input_cols, filter_rows,
+ filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
+ stride, col_buffer_data);
+
+ ConstMatrixMap A(col_buffer_data, output_image_size, filter_total_size);
+ ConstMatrixMap B(out_backprop_data + output_offset * image_id,
+ output_image_size, out_depth);
+
+ // Compute gradient with respect to filter.
+ // TODO(andydavis) Use a multi-threaded matmul implementation here.
+ C.noalias() += A.transpose() * B;
+
+ input_data += input_offset;
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropFilterOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_CPU)
+ .Label("custom")
+ .TypeConstraint<float>("T"),
+ Conv2DCustomBackpropFilterOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_CPU)
+ .Label("eigen_tensor")
+ .TypeConstraint<float>("T"),
+ Conv2DFastBackpropFilterOp<CPUDevice, float>);
+
+// GPU definitions of both ops.
+#if GOOGLE_CUDA
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+ uint64 size) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+ size * sizeof(T));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+// The slow version (but compiles for GPU)
+
+// Backprop for input.
+template <typename Device, class T>
+class Conv2DSlowBackpropInputOp : public OpKernel {
+ public:
+ explicit Conv2DSlowBackpropInputOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+ use_cudnn_ &= CanUseCudnn();
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_sizes = context->input(0);
+ const Tensor& filter = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
+ input_sizes.dims()));
+ TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
+ const TensorShape& filter_shape = filter.shape();
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
+ Tensor* in_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_shape, &in_backprop));
+
+ const int padding_rows =
+ (output_rows - 1) * stride + filter_rows - input_rows;
+ const int padding_cols =
+ (output_cols - 1) * stride + filter_cols - input_cols;
+
+ // TODO(keveman): cuDNN only supports equal padding on both sides, so only
+ // calling it when that is true. Remove this check when (if?) cuDNN starts
+ // supporting different padding.
+ bool padding_compatible =
+ (padding_rows % 2 == 0) && (padding_cols % 2 == 0);
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ if (use_cudnn_ && padding_compatible) {
+ if (filter_rows == 1 && filter_cols == 1 && stride == 1) {
+ // 1x1 filter, so call cublas directly.
+ const uint64 m = batch * input_rows * input_cols;
+ const uint64 k = out_depth;
+ const uint64 n = in_depth;
+
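+        // With a 1x1 filter and unit stride, the input gradient is just
+        // in_backprop = out_backprop * filter^T; the GEMM below computes this
+        // with out_backprop viewed as an [m, k] matrix.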
+ auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
+ out_backprop.template flat<T>().size());
+ auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
+ filter.template flat<T>().size());
+ auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
+ in_backprop->template flat<T>().size());
+
+ auto transpose = perftools::gputools::blas::Transpose::kTranspose;
+ auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+
+ bool blas_launch_status =
+ stream->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr,
+ k, a_ptr, k, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=",
+ m, ", n=", n, ", k=", k));
+ }
+ return;
+ }
+
+ perftools::gputools::dnn::BatchDescriptor input_desc;
+ input_desc.set_count(batch)
+ .set_height(input_rows)
+ .set_width(input_cols)
+ .set_feature_map_count(in_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::BatchDescriptor output_desc;
+ output_desc.set_count(batch)
+ .set_height(output_rows)
+ .set_width(output_cols)
+ .set_feature_map_count(out_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::FilterDescriptor filter_desc;
+ filter_desc.set_input_filter_height(filter_rows)
+ .set_input_filter_width(filter_cols)
+ .set_input_feature_map_count(in_depth)
+ .set_output_feature_map_count(out_depth);
+ perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+ conv_desc.set_vertical_filter_stride(stride)
+ .set_horizontal_filter_stride(stride)
+ .set_zero_padding_height(padding_rows / 2)
+ .set_zero_padding_width(padding_cols / 2);
+
+ // NOTE(keveman):
+ // cuDNN only supports the following layouts :
+ // Input : B x D x R x C
+ // Filter : OD x ID x R x C
+ // Whereas, we have
+ // Input : B x R x C x D
+ // Filter : R x C x ID x OD
+ // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C)
+ // The first TransformDepth performs
+ // (B x R x C x D) => (B x D x R x C).
+ // Since the tensor returned from cuDNN is B x D x R x C also,
+ // the second TransformDepth performs
+ // (B x D x R x C) => (B x R x C x D).
+ Tensor transformed_filter;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_depth, in_depth, filter_rows, filter_cols}),
+ &transformed_filter));
+
+ functor::TransformFilter<Device, T>()(context->eigen_device<Device>(),
+ filter.tensor<T, 4>(),
+ transformed_filter.tensor<T, 4>());
+
+ Tensor transformed_out_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, out_depth, output_rows, output_cols}),
+ &transformed_out_backprop));
+
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
+ transformed_out_backprop.tensor<T, 4>());
+
+ Tensor pre_transformed_in_backprop;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, in_depth, input_rows, input_cols}),
+ &pre_transformed_in_backprop));
+
+ auto out_backprop_ptr =
+ AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
+ transformed_out_backprop.template flat<T>().size());
+ auto filter_ptr =
+ AsDeviceMemory(transformed_filter.template flat<T>().data(),
+ transformed_filter.template flat<T>().size());
+ auto in_backprop_ptr =
+ AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
+ pre_transformed_in_backprop.template flat<T>().size());
+
+ bool cudnn_launch_status =
+ stream->ThenConvolveBackwardData(filter_desc, filter_ptr, output_desc,
+ out_backprop_ptr, conv_desc,
+ input_desc, &in_backprop_ptr)
+ .ok();
+
+ if (!cudnn_launch_status) {
+ context->SetStatus(errors::Internal(
+ "cuDNN Backward Data function launch failure : input shape(",
+ input_shape.DebugString(), ") filter shape(",
+ filter_shape.DebugString(), ")"));
+ }
+
+ auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(),
+ toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1),
+ in_backprop->tensor<T, 4>());
+ } else {
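+      // Falling back to the non-cudnn path, the input gradient is computed
+      // as a VALID convolution of the inflated and zero-padded out_backprop
+      // with a filter whose spatial dimensions are reversed and whose
+      // in_depth and out_depth dimensions are swapped.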
+ // We fill out a padded out_backprop
+ TensorShape padded_out_shape(
+ {batch, padded_out_rows, padded_out_cols, out_depth});
+ Tensor padded_output;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ padded_out_shape, &padded_output));
+
+ Eigen::DSizes<Eigen::DenseIndex, 4> trivial_order{0, 1, 2, 3};
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{
+ {{0, 0},
+ {top_pad_rows, bottom_pad_rows},
+ {left_pad_cols, right_pad_cols},
+ {0, 0}}};
+
+ functor::InflatePadAndShuffle<Device, T, 4>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides,
+ pad_dims, trivial_order, padded_output.tensor<T, 4>());
+ const Tensor& padded_output_cref = padded_output;
+
+      // We then need to fill a new "reverted" filter: we transpose the
+      // in_depth and out_depth dimensions of the filter and reverse its rows
+      // and cols.
+ TensorShape r_filter_shape(
+ {filter_rows, filter_cols, out_depth, in_depth});
+ Tensor r_filter;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ r_filter_shape, &r_filter));
+
+ Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{0, 1, 3, 2};
+ Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
+ functor::ShuffleAndReverse<Device, T, 4>()(
+ context->eigen_device<Device>(), filter.tensor<T, 4>(), filter_order,
+ filter_rev_dims, r_filter.tensor<T, 4>());
+ const Tensor& r_filter_cref = r_filter;
+
+ // Now we can call conv_2d directly.
+ functor::SpatialConvolution<Device, T>()(
+ context->eigen_device<Device>(), in_backprop->tensor<T, 4>(),
+ padded_output_cref.tensor<T, 4>(), r_filter_cref.tensor<T, 4>(), 1,
+ BrainPadding2EigenPadding(VALID));
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+ bool use_cudnn_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropInputOp);
+};
+
+// Backprop for filter.
+template <typename Device, class T>
+class Conv2DSlowBackpropFilterOp : public OpKernel {
+ public:
+ explicit Conv2DSlowBackpropFilterOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+ use_cudnn_ &= CanUseCudnn();
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& filter_sizes = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(filter_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
+ filter_sizes.dims()));
+ const TensorShape& input_shape = input.shape();
+ TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());
+
+ EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter");
+ Tensor* filter_backprop = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, filter_shape, &filter_backprop));
+
+ const int padding_rows =
+ (output_rows - 1) * stride + filter_rows - input_rows;
+ const int padding_cols =
+ (output_cols - 1) * stride + filter_cols - input_cols;
+
+ // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
+ // calling it when that is true. Remove this check when (if?) cuDNN starts
+ // supporting different padding.
+ bool padding_compatible =
+ (padding_rows % 2 == 0) && (padding_cols % 2 == 0);
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ if (use_cudnn_ && padding_compatible) {
+ if (filter_rows == 1 && filter_cols == 1 && stride == 1) {
+ const uint64 m = in_depth;
+ const uint64 k = batch * input_rows * input_cols;
+ const uint64 n = out_depth;
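+        // For a 1x1 filter with unit stride, the filter gradient reduces to
+        // filter_backprop = input^T * out_backprop, contracting over the
+        // batch and spatial positions (the k dimension below).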
+
+ // The shape of output backprop is
+ // [batch, out_rows, out_cols, out_depth]
+ // From cublas's perspective, it is: n x k
+ auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
+ out_backprop.template flat<T>().size());
+
+ // The shape of input is
+ // [batch, in_rows, in_cols, in_depth],
+ // From cublas's perspective, it is: m x k
+ auto b_ptr = AsDeviceMemory(input.template flat<T>().data(),
+ input.template flat<T>().size());
+
+        // The shape of the filter backprop from the conv_2d should be
+ // [1, 1, in_depth, out_depth]
+ // From cublas's perspective, it is: n x m
+ auto c_ptr = AsDeviceMemory(filter_backprop->template flat<T>().data(),
+ filter_backprop->template flat<T>().size());
+
+ bool blas_launch_status =
+ stream->ThenBlasGemm(
+ perftools::gputools::blas::Transpose::kNoTranspose,
+ perftools::gputools::blas::Transpose::kTranspose, n, m, k,
+ 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=",
+ m, ", n=", n, ", k=", k));
+ }
+ return;
+ }
+
+ perftools::gputools::dnn::BatchDescriptor input_desc;
+ input_desc.set_count(batch)
+ .set_height(input_rows)
+ .set_width(input_cols)
+ .set_feature_map_count(in_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::BatchDescriptor output_desc;
+ output_desc.set_count(batch)
+ .set_height(output_rows)
+ .set_width(output_cols)
+ .set_feature_map_count(out_depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+ perftools::gputools::dnn::FilterDescriptor filter_desc;
+ filter_desc.set_input_filter_height(filter_rows)
+ .set_input_filter_width(filter_cols)
+ .set_input_feature_map_count(in_depth)
+ .set_output_feature_map_count(out_depth);
+ perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+ conv_desc.set_vertical_filter_stride(stride)
+ .set_horizontal_filter_stride(stride)
+ .set_zero_padding_height(padding_rows / 2)
+ .set_zero_padding_width(padding_cols / 2);
+
+ // NOTE(zhengxq):
+ // cuDNN only supports the following layouts :
+ // Input : B x D x R x C
+ // Filter : OD x ID x R x C
+ // Whereas, we have
+ // Input : B x R x C x D
+ // Filter : R x C x ID x OD
+ // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C)
+ // The first TransformDepth performs
+ // (B x R x C x D) => (B x D x R x C).
+ // Since the tensor returned from cuDNN is B x D x R x C also,
+ // the second TransformDepth performs
+ // (B x D x R x C) => (B x R x C x D).
+
+ Tensor pre_transformed_filter_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_depth, in_depth, filter_rows, filter_cols}),
+ &pre_transformed_filter_backprop));
+
+ Tensor transformed_out_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, out_depth, output_rows, output_cols}),
+ &transformed_out_backprop));
+
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
+ transformed_out_backprop.tensor<T, 4>());
+
+ Tensor transformed_input;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({batch, in_depth, input_rows, input_cols}),
+ &transformed_input));
+
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
+ transformed_input.tensor<T, 4>());
+
+ auto out_backprop_ptr =
+ AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
+ transformed_out_backprop.template flat<T>().size());
+ auto filter_backprop_ptr = AsDeviceMemory(
+ pre_transformed_filter_backprop.template flat<T>().data(),
+ pre_transformed_filter_backprop.template flat<T>().size());
+ auto input_ptr =
+ AsDeviceMemory(transformed_input.template flat<T>().data(),
+ transformed_input.template flat<T>().size());
+
+ bool cudnn_launch_status =
+ stream->ThenConvolveBackwardFilter(input_desc, input_ptr, output_desc,
+ out_backprop_ptr, conv_desc,
+ filter_desc, &filter_backprop_ptr)
+ .ok();
+
+ if (!cudnn_launch_status) {
+ context->SetStatus(errors::Internal(
+ "cuDNN Backward Filter function launch failure : input shape(",
+ input_shape.DebugString(), ") filter shape(",
+ filter_shape.DebugString(), ")"));
+ }
+
+ auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+ functor::TransformDepth<Device, T>()(
+ context->eigen_device<Device>(),
+ toConstTensor(pre_transformed_filter_backprop)
+ .template tensor<T, 4>(),
+ Eigen::DSizes<Eigen::DenseIndex, 4>(2, 3, 1, 0),
+ filter_backprop->tensor<T, 4>());
+ } else {
+ // Fall back to the non-cudnn code path
+
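+      // The filter gradient of a convolution is itself a convolution between
+      // the input and the output gradient, with the batch dimension acting
+      // as the contracted (depth) dimension; both tensors are shuffled below
+      // so that batch lands in the depth position.
+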
+ // For the backprop of the filter, we need to also transpose the
+ // out_backprop.
+ // The shape of backprop is
+ // [batch, out_rows, out_cols, out_depth]
+ // And we need to change it to
+ // [out_depth, out_rows, out_cols, batch]
+ Eigen::DSizes<Eigen::DenseIndex, 4> out_order{3, 1, 2, 0};
+ TensorShape padded_out_shape(
+ {out_depth, padded_out_rows, padded_out_cols, batch});
+ Tensor padded_output;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ padded_out_shape, &padded_output));
+
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{
+ {{0, 0},
+ {top_pad_rows, bottom_pad_rows},
+ {left_pad_cols, right_pad_cols},
+ {0, 0}}};
+ functor::InflatePadAndShuffle<Device, T, 4>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides,
+ pad_dims, out_order, padded_output.tensor<T, 4>());
+ const Tensor& padded_output_cref = padded_output;
+
+ // For the backprop of the filter, we need to transpose the input.
+ // The shape of input is
+ // [batch, in_rows, in_cols, in_depth]
+ // And we need to change it to
+ // [in_rows, in_cols, batch, in_depth]
+ Eigen::DSizes<Eigen::DenseIndex, 4> in_order{1, 2, 0, 3};
+ TensorShape in_shuffle_shape({input_rows, input_cols, batch, in_depth});
+ Tensor in_shuffle;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ in_shuffle_shape, &in_shuffle));
+
+ // No need for reversing this time.
+ Eigen::array<bool, 4> trivial_dims{false, false, false, false};
+ functor::ShuffleAndReverse<Device, T, 4>()(
+ context->eigen_device<Device>(), input.tensor<T, 4>(), in_order,
+ trivial_dims, in_shuffle.tensor<T, 4>());
+ const Tensor& in_shuffle_cref = in_shuffle;
+
+ // The output of the conv_2d would be
+ // [out_depth, filter_rows, filter_cols, in_depth]
+ // and we need to shuffle it back to
+ // [filter_rows, filter_cols, in_depth, out_depth];
+      // And we need to reverse the filter backprops.
+      // So we need to allocate (sigh) yet another piece of memory to hold
+      // the output.
+ TensorShape filter_shuffle_shape(
+ {out_depth, filter_rows, filter_cols, in_depth});
+ Tensor filter_shuffle;
+ OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(),
+ filter_shuffle_shape,
+ &filter_shuffle));
+
+ functor::SpatialConvolution<Device, T>()(
+ context->eigen_device<Device>(), filter_shuffle.tensor<T, 4>(),
+ padded_output_cref.tensor<T, 4>(), in_shuffle_cref.tensor<T, 4>(), 1,
+ BrainPadding2EigenPadding(VALID));
+
+ // Now copy the filter_backprop back to the destination.
+ Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{1, 2, 3, 0};
+ Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
+ const Tensor& filter_shuffle_cref = filter_shuffle;
+ functor::ShuffleAndReverse<Device, T, 4>()(
+ context->eigen_device<Device>(), filter_shuffle_cref.tensor<T, 4>(),
+ filter_order, filter_rev_dims, filter_backprop->tensor<T, 4>());
+ }
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+ bool use_cudnn_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropFilterOp);
+};
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ShuffleAndReverse<GPUDevice, T, 4>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
+ const Eigen::array<bool, 4>& reverse_dims, \
+ typename TTypes<T, 4>::Tensor output); \
+ extern template struct ShuffleAndReverse<GPUDevice, T, 4>; \
+ template <> \
+ void InflatePadAndShuffle<GPUDevice, T, 4>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& strides, \
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4>& pad_dims, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
+ typename TTypes<T, 4>::Tensor output); \
+ extern template struct InflatePadAndShuffle<GPUDevice, T, 4>; \
+ template <> \
+ void TransformFilter<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformFilter<GPUDevice, T>; \
+ template <> \
+ void TransformDepth<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformDepth<GPUDevice, T>; \
+ template <> \
+ void SpatialConvolution<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T, 4>::ConstTensor filter, int stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialConvolution<GPUDevice, T>; \
+ template <> \
+ void SpatialConvolutionBackwardInput<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor in_backprop, \
+ typename TTypes<T, 4>::ConstTensor filter, \
+ typename TTypes<T, 4>::ConstTensor output_backprop, int input_rows, \
+ int input_cols, int stride); \
+ extern template struct SpatialConvolutionBackwardInput<GPUDevice, T>
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("input_sizes"),
+ Conv2DSlowBackpropInputOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("filter_sizes"),
+ Conv2DSlowBackpropFilterOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
new file mode 100644
index 0000000000..aaa2951778
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -0,0 +1,373 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define USE_EIGEN_TENSOR
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+struct LaunchGeneric {
+ static void launch(OpKernelContext* ctx, const Tensor& input,
+ const Tensor& filter, int stride,
+ const Eigen::PaddingType& padding, Tensor* output) {
+ if (filter.dim_size(1) == filter.dim_size(0) && filter.dim_size(0) == 1 &&
+ stride == 1) {
+ // For 1x1 kernel, the 2D convolution is reduced to matrix
+ // multiplication.
+ //
+ // TODO(vrv): We should be able to call SpatialConvolution
+ // and it will produce the same result, but doing so
+ // led to NaNs during training. Using matmul instead for now.
+ int conv_width = 1; // Width for the convolution step.
+ for (int i = 0; i < 3; ++i) {
+ conv_width *= output->dim_size(i);
+ }
+
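+      // conv_width is batch * out_rows * out_cols, so the contraction below
+      // runs over in_depth and produces a [conv_width, out_depth] output.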
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ functor::MatMulConvFunctor<Device, T>()(
+ ctx->eigen_device<Device>(),
+ output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
+ input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
+ filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
+ dim_pair);
+ } else {
+ functor::SpatialConvolution<Device, T>()(
+ ctx->eigen_device<Device>(), output->tensor<T, 4>(),
+ input.tensor<T, 4>(), filter.tensor<T, 4>(), stride, padding);
+ }
+ }
+};
+
+template <typename Device, typename T>
+struct LaunchConvOp;
+
+template <typename T>
+struct LaunchConvOp<CPUDevice, T> {
+ static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input,
+ const Tensor& filter, int stride,
+ const Eigen::PaddingType& padding, Tensor* output) {
+ LaunchGeneric<CPUDevice, T>::launch(ctx, input, filter, stride, padding,
+ output);
+ }
+};
+
+template <typename Device, typename T>
+class Conv2DOp : public BinaryOp<T> {
+ public:
+ explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+ use_cudnn_ &= CanUseCudnn();
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(context, strides_[1] == strides_[2],
+ errors::InvalidArgument(
+ "Current implementation only supports equal length "
+ "strides in the row and column dimensions."));
+ OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+ errors::InvalidArgument(
+ "Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // Input tensor is of the following dimensions:
+ // [ batch, in_rows, in_cols, in_depth ]
+
+ const Tensor& input = context->input(0);
+
+ // Input filter is of the following dimensions:
+ // [ filter_rows, filter_cols, in_depth, out_depth]
+ const Tensor& filter = context->input(1);
+
+ // For 2D convolution, there should be 4 dimensions.
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ OP_REQUIRES(context, filter.dims() == 4,
+ errors::InvalidArgument("filter must be 4-dimensional: ",
+ filter.shape().ShortDebugString()));
+
+ // The last dimension for input is in_depth. It must be the same as the
+ // filter's in_depth.
+ const int64 in_depth = input.dim_size(3);
+ OP_REQUIRES(
+ context, in_depth == filter.dim_size(2),
+ errors::InvalidArgument("input and filter must have the same depth: ",
+ in_depth, " vs ", filter.dim_size(2)));
+
+ // The last dimension for filter is out_depth.
+ const int64 out_depth = filter.dim_size(3);
+
+ // The second dimension for input is rows/height.
+ // The first dimension for filter is rows/height.
+ const int64 input_rows = input.dim_size(1);
+ const int64 filter_rows = filter.dim_size(0);
+
+ // The third dimension for input is columns/width.
+ // The second dimension for filter is columns/width.
+ const int64 input_cols = input.dim_size(2);
+ const int64 filter_cols = filter.dim_size(1);
+
+ // The first dimension for input is batch.
+ const int64 batch = input.dim_size(0);
+
+ // For now we take the stride from the second dimension only (we
+ // assume row = col stride, and do not support striding on the
+ // batch or depth dimension).
+ const int stride = strides_[1];
+
+ int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+ if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) {
+ // For 1x1 kernel, the 2D convolution is reduced to matrix
+ // multiplication.
+ out_rows = input_rows;
+ out_cols = input_cols;
+ } else {
+ OP_REQUIRES_OK(
+ context, Get2dOutputSize(input_rows, input_cols, filter_rows,
+ filter_cols, stride, stride, padding_,
+ &out_rows, &out_cols, &pad_rows, &pad_cols));
+ }
+ TensorShape out_shape({batch, out_rows, out_cols, out_depth});
+
+ // Output tensor is of the following dimensions:
+ // [ in_batch, out_rows, out_cols, out_depth ]
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+ VLOG(2) << "Conv2D: in_depth = " << in_depth
+ << ", input_cols = " << input_cols
+ << ", filter_cols = " << filter_cols
+ << ", input_rows = " << input_rows
+ << ", filter_rows = " << filter_rows << ", stride = " << stride
+ << ", out_depth = " << out_depth;
+
+ LaunchConvOp<Device, T>::launch(context, use_cudnn_, input, filter, stride,
+ BrainPadding2EigenPadding(padding_),
+ output);
+ }
+
+ private:
+ std::vector<int32> strides_;
+ bool use_cudnn_;
+ Padding padding_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Conv2D")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ Conv2DOp<CPUDevice, float>);
+
+#if GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+ uint64 size) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+ size * sizeof(T));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+template <typename T>
+struct LaunchConvOp<GPUDevice, T> {
+ static void launch(OpKernelContext* ctx, bool use_cudnn,
+ const Tensor& input_param, const Tensor& filter,
+ int stride, const Eigen::PaddingType& padding,
+ Tensor* output) {
+ auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+ if (use_cudnn) {
+ Tensor input = input_param;
+ if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
+ // 1x1 filter, so call cublas directly.
+ const uint64 m =
+ input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
+ const uint64 k = filter.dim_size(2);
+ const uint64 n = filter.dim_size(3);
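+        // A 1x1 convolution with unit stride is a plain matrix product:
+        // output = input * filter, with the input viewed as an [m, k] matrix
+        // and the filter as a [k, n] matrix.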
+
+ auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
+ input.template flat<T>().size());
+ auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
+ filter.template flat<T>().size());
+ auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
+ output->template flat<T>().size());
+
+ auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+ bool blas_launch_status =
+ stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f,
+ b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
+ ", n=", n, ", k=", k));
+ }
+ return;
+ }
+ if (padding == Eigen::PADDING_SAME) {
+ const int64 out_rows = output->dim_size(1);
+ const int64 out_cols = output->dim_size(2);
+ const int64 in_rows = input.dim_size(1);
+ const int64 in_cols = input.dim_size(2);
+ const int64 patch_rows = filter.dim_size(0);
+ const int64 patch_cols = filter.dim_size(1);
+ // Total padding on rows and cols is
+ // Pr = (R' - 1) * S + Kr - R
+ // Pc = (C' - 1) * S + Kc - C
+ // where (R', C') are output dimensions, (R, C) are input dimensions, S
+ // is stride, (Kr, Kc) are filter dimensions.
+ // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
+ // and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
+ // we pad more on the right and bottom than on the top and left.
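+        // For example, with SAME padding, in_rows = 7, stride = 2 and a 3x3
+        // filter, out_rows = 4 and Pr = (4 - 1) * 2 + 3 - 7 = 2, so one row
+        // of zeros is added on each side.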
+ const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows;
+ const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols;
+ Tensor transformed_input;
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape(
+ {input.dim_size(0), input.dim_size(1) + padding_rows,
+ input.dim_size(2) + padding_cols, input.dim_size(3)}),
+ &transformed_input));
+
+ functor::PadInput<GPUDevice, T>()(
+ ctx->eigen_device<GPUDevice>(), input_param.tensor<T, 4>(),
+ padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
+ padding_cols - padding_cols / 2, transformed_input.tensor<T, 4>());
+ input = transformed_input;
+ }
+
+ perftools::gputools::dnn::BatchDescriptor input_desc;
+ input_desc.set_count(input.dim_size(0))
+ .set_height(input.dim_size(1))
+ .set_width(input.dim_size(2))
+ .set_feature_map_count(input.dim_size(3))
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+ perftools::gputools::dnn::BatchDescriptor output_desc;
+ output_desc.set_count(output->dim_size(0))
+ .set_height(output->dim_size(1))
+ .set_width(output->dim_size(2))
+ .set_feature_map_count(output->dim_size(3))
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+ perftools::gputools::dnn::FilterDescriptor filter_desc;
+ filter_desc.set_input_filter_height(filter.dim_size(0))
+ .set_input_filter_width(filter.dim_size(1))
+ .set_input_feature_map_count(filter.dim_size(2))
+ .set_output_feature_map_count(filter.dim_size(3));
+ perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+ conv_desc.set_vertical_filter_stride(stride)
+ .set_horizontal_filter_stride(stride);
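+      // No zero padding is set on conv_desc: for SAME padding the input has
+      // already been padded explicitly above, and VALID needs none.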
+
+ Tensor transformed_filter;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({filter.dim_size(3), filter.dim_size(2),
+ filter.dim_size(0), filter.dim_size(1)}),
+ &transformed_filter));
+
+ functor::TransformFilter<GPUDevice, T>()(
+ ctx->eigen_device<GPUDevice>(), filter.tensor<T, 4>(),
+ transformed_filter.tensor<T, 4>());
+
+ auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
+ input.template flat<T>().size());
+ auto filter_ptr =
+ AsDeviceMemory(transformed_filter.template flat<T>().data(),
+ transformed_filter.template flat<T>().size());
+ auto output_ptr = AsDeviceMemory(output->template flat<T>().data(),
+ output->template flat<T>().size());
+
+ bool cudnn_launch_status =
+ stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr,
+ conv_desc, output_desc, &output_ptr)
+ .ok();
+
+ if (!cudnn_launch_status) {
+ ctx->SetStatus(errors::Internal(
+ "cuDNN launch failure : input shape(", input.shape().DebugString(),
+ ") filter shape(", filter.shape().DebugString(), ")"));
+ }
+ } else {
+ LaunchGeneric<GPUDevice, T>::launch(ctx, input_param, filter, stride,
+ padding, output);
+ }
+ }
+};
+
+#endif // GOOGLE_CUDA
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void SpatialConvolution<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, \
+ typename TTypes<T, 4>::ConstTensor filter, int stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialConvolution<GPUDevice, T>; \
+ template <> \
+ void MatMulConvFunctor<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 2>::Tensor out, \
+ typename TTypes<T, 2>::ConstTensor in0, \
+ typename TTypes<T, 2>::ConstTensor in1, \
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
+ extern template struct MatMulConvFunctor<GPUDevice, T>; \
+ template <> \
+ void TransformFilter<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformFilter<GPUDevice, T>; \
+ template <> \
+ void PadInput<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ int padding_rows_left, int padding_rows_right, int padding_cols_left, \
+ int padding_cols_right, typename TTypes<T, 4>::Tensor out); \
+ extern template struct PadInput<GPUDevice, T>
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+REGISTER_KERNEL_BUILDER(Name("Conv2D")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ Conv2DOp<GPUDevice, float>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_gpu.cu.cc b/tensorflow/core/kernels/conv_ops_gpu.cu.cc
new file mode 100644
index 0000000000..44af814e2b
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu.cu.cc
@@ -0,0 +1,35 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T>
+struct SpatialConvolution<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input,
+ typename TTypes<T, 4>::ConstTensor filter, int stride,
+ const Eigen::PaddingType& padding) {
+ // TODO(keveman): nvcc 6.5 crashes when 32 bit indexing is turned on. Enable
+ // this when we move to cuda 7.0.
+ // SpatialConvolutionFunc(d, To32Bit(output), To32Bit(input),
+ // To32Bit(filter), stride, padding);
+
+ SpatialConvolutionFunc(d, output, input, filter, stride, padding);
+ }
+};
+
+template struct SpatialConvolution<GPUDevice, float>;
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
new file mode 100644
index 0000000000..e2e9d25d83
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
@@ -0,0 +1,16 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::InflatePadAndShuffle<GPUDevice, float, 4>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
new file mode 100644
index 0000000000..dbbe08ef9c
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -0,0 +1,22 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::ShuffleAndReverse<GPUDevice, float, 4>;
+
+template struct functor::TransformFilter<GPUDevice, float>;
+
+template struct functor::PadInput<GPUDevice, float>;
+
+template struct functor::TransformDepth<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc
new file mode 100644
index 0000000000..87d79ecb4d
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc
@@ -0,0 +1,16 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::MatMulConvFunctor<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/core_ops_test.cc b/tensorflow/core/kernels/core_ops_test.cc
new file mode 100644
index 0000000000..a42a5999da
--- /dev/null
+++ b/tensorflow/core/kernels/core_ops_test.cc
@@ -0,0 +1,990 @@
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/port.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+static void SetConstOp(const string& name, std::initializer_list<int64> dims,
+ NodeDef* node) {
+ Tensor tensor(DT_FLOAT, TensorShape(dims));
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ tensor.flat<float>()(i) = i / 10.0f;
+ }
+ TF_CHECK_OK(NodeDefBuilder(name, "Const")
+ .Attr("dtype", DT_FLOAT)
+ .Attr("value", tensor)
+ .Finalize(node));
+}
+
+static void SetConstSizesOp(const string& name, const std::vector<int32>& sizes,
+ NodeDef* node) {
+ TensorShape shape;
+ shape.AddDim(sizes.size());
+ Tensor tensor(DT_INT32, shape);
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ tensor.flat<int32>()(i) = sizes[i];
+ }
+ TF_CHECK_OK(NodeDefBuilder(name, "Const")
+ .Attr("dtype", DT_INT32)
+ .Attr("value", tensor)
+ .Finalize(node));
+}
+
+namespace {
+
+enum CONV_OP {
+ CONV_OP_FORWARD = 0,
+ CONV_OP_BACKPROP_INPUT = 1,
+ CONV_OP_BACKPROP_FILTER = 2
+};
+
+} // namespace
+
+static void BM_ConvFloat(int iters, int batch, int rows, int cols, int in_depth,
+ int out_depth, int filter_rows, int filter_cols,
+ CONV_OP op, int num_threads, int stride,
+ Padding padding, bool use_gpu, const string& label) {
+ if (!IsGoogleCudaEnabled() && use_gpu) {
+ testing::SetLabel(
+ strings::StrCat("Skipping GPU test (no --config=cuda): ", label));
+ return;
+ }
+ testing::SetLabel(label);
+
+ // Set the number of threads
+ SessionOptions options;
+ options.config.set_intra_op_parallelism_threads(num_threads);
+
+ // We set up a graph for computing convolution.
+ GraphDef graph;
+
+ // For this, we need an input tensor and a filter tensor.
+ // Compute the output size.
+ int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+ TF_CHECK_OK(Get2dOutputSize(rows, cols, filter_rows, filter_cols, stride,
+ stride, padding, &out_rows, &out_cols, &pad_rows,
+ &pad_cols));
+ // Counting the number of floating point operations (both MUL and ADD)
+ int64 num_ops = 0;
+ if (op == CONV_OP_FORWARD) {
+ // Forward computation:
+    // BATCH x OUT_ROW X OUT_COL X IN_DEPTH X PATCH_ROW X PATCH_COL X OUT_DEPTH
+    // We multiply by two since there are multiplications and additions.
+ num_ops = static_cast<int64>(batch * in_depth * out_depth) *
+ static_cast<int64>(filter_rows * filter_cols) *
+ static_cast<int64>(out_rows * out_cols) * 2;
+ } else {
+ // Backward computation: both input and filter backprop take the same
+ // amount of computation:
+ // BATCH x IN_ROW X IN_COL X IN_DEPTH X PATCH_ROW X PATCH_COL X OUT_DEPTH
+    // We multiply by two since there are multiplications and additions.
+ num_ops = static_cast<int64>(batch * in_depth * out_depth) *
+ static_cast<int64>(filter_rows * filter_cols) *
+ static_cast<int64>(rows * cols) * 2;
+ }
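+  // For example, the forward pass of conv0 below (batch 32, 5x5 spatial,
+  // 1248 input and 128 output channels, 1x1 filter, SAME padding) counts
+  // 32 * 1248 * 128 * 1 * 1 * 5 * 5 * 2, roughly 2.6e8 operations.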
+
+ SetConstOp("input", {batch, rows, cols, in_depth}, graph.add_node());
+ SetConstOp("filter", {filter_rows, filter_cols, in_depth, out_depth},
+ graph.add_node());
+ SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth},
+ graph.add_node());
+ SetConstSizesOp("input_sizes",
+ std::vector<int32>({batch, rows, cols, in_depth}),
+ graph.add_node());
+ SetConstSizesOp("filter_sizes", std::vector<int32>({filter_rows, filter_cols,
+ in_depth, out_depth}),
+ graph.add_node());
+
+ // Now add the convolution op
+ NodeDef* conv = graph.add_node();
+ switch (op) {
+ case CONV_OP_FORWARD:
+ TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2D")
+ .Input("input", 0, DT_FLOAT)
+ .Input("filter", 0, DT_FLOAT)
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(conv));
+ break;
+ case CONV_OP_BACKPROP_INPUT:
+ TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropInput")
+ .Input("input_sizes", 0, DT_INT32)
+ .Input("filter", 0, DT_FLOAT)
+ .Input("output_backprop", 0, DT_FLOAT)
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(conv));
+ break;
+ case CONV_OP_BACKPROP_FILTER:
+ TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropFilter")
+ .Input("input", 0, DT_FLOAT)
+ .Input("filter_sizes", 0, DT_INT32)
+ .Input("output_backprop", 0, DT_FLOAT)
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(conv));
+ break;
+ }
+ Graph* g = new Graph(OpRegistry::Global());
+ GraphConstructorOptions opts;
+ TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g));
+
+ string device = use_gpu ? "gpu" : "cpu";
+ test::Benchmark(device, g, &options).Run(iters);
+ testing::ItemsProcessed(num_ops * iters);
+}
+
+// BS: batch_size
+// R: tensor_in_rows
+// C: tensor_in_cols
+// ID: input_depth
+// OD: output_depth
+// KR: kernel_rows
+// KC: kernel_cols
+#define BM_ConvFloatFwd(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \
+ static void BM_ConvFloatFwdCPU1_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \
+ PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
+ } \
+ static void BM_ConvFloatFwdCPU4_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 4, STR, \
+ PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
+ } \
+ static void BM_ConvFloatFwdGPU_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \
+ PAD, true, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
+ } \
+ BENCHMARK(BM_ConvFloatFwdCPU1_##LABEL); \
+ BENCHMARK(BM_ConvFloatFwdCPU4_##LABEL); \
+ BENCHMARK(BM_ConvFloatFwdGPU_##LABEL)
+
+BM_ConvFloatFwd(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0);
+BM_ConvFloatFwd(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1);
+BM_ConvFloatFwd(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2);
+BM_ConvFloatFwd(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3);
+BM_ConvFloatFwd(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4);
+BM_ConvFloatFwd(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5);
+BM_ConvFloatFwd(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6);
+BM_ConvFloatFwd(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7);
+BM_ConvFloatFwd(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8);
+BM_ConvFloatFwd(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9);
+BM_ConvFloatFwd(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10);
+BM_ConvFloatFwd(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13);
+BM_ConvFloatFwd(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14);
+BM_ConvFloatFwd(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15);
+BM_ConvFloatFwd(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16);
+BM_ConvFloatFwd(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17);
+BM_ConvFloatFwd(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18);
+BM_ConvFloatFwd(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19);
+BM_ConvFloatFwd(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20);
+BM_ConvFloatFwd(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21);
+BM_ConvFloatFwd(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22);
+BM_ConvFloatFwd(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24);
+BM_ConvFloatFwd(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25);
+BM_ConvFloatFwd(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26);
+BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27);
+BM_ConvFloatFwd(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28);
+BM_ConvFloatFwd(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29);
+BM_ConvFloatFwd(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30);
+BM_ConvFloatFwd(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31);
+BM_ConvFloatFwd(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32);
+BM_ConvFloatFwd(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33);
+BM_ConvFloatFwd(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34);
+BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35);
+BM_ConvFloatFwd(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36);
+BM_ConvFloatFwd(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37);
+BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38);
+BM_ConvFloatFwd(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39);
+BM_ConvFloatFwd(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40);
+BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41);
+BM_ConvFloatFwd(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42);
+BM_ConvFloatFwd(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43);
+BM_ConvFloatFwd(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44);
+BM_ConvFloatFwd(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45);
+BM_ConvFloatFwd(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46);
+BM_ConvFloatFwd(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47);
+BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48);
+BM_ConvFloatFwd(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49);
+BM_ConvFloatFwd(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50);
+BM_ConvFloatFwd(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51);
+BM_ConvFloatFwd(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52);
+BM_ConvFloatFwd(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53);
+BM_ConvFloatFwd(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54);
+
+#define BM_ConvFloatBkInAndFilter(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \
+ static void BM_ConvFloatBkInCPU1_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
+ } \
+ static void BM_ConvFloatBkInCPU4_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 4, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
+ } \
+ static void BM_ConvFloatBkInGPU_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \
+ STR, PAD, true, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
+ } \
+ static void BM_ConvFloatBkFilterCPU1_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
+ } \
+ static void BM_ConvFloatBkFilterCPU4_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 4, \
+ STR, PAD, false, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
+ } \
+ static void BM_ConvFloatBkFilterGPU_##LABEL(int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
+ STR, PAD, true, \
+ strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \
+ KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
+ } \
+ BENCHMARK(BM_ConvFloatBkInCPU1_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkInCPU4_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkInGPU_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkFilterCPU1_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkFilterCPU4_##LABEL); \
+ BENCHMARK(BM_ConvFloatBkFilterGPU_##LABEL)
+
+// Benchmarks from the inception model
+
+BM_ConvFloatBkInAndFilter(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10);
+BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39);
+BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50);
+BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51);
+BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52);
+BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53);
+BM_ConvFloatBkInAndFilter(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54);
+
+#define BM_ConvFloatBkFCPU(BS, R, C, ID, OD, KR, KC, TH, LABEL) \
+ static void \
+ BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH( \
+ int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, TH, \
+ 1, VALID, false, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH)
+
+// Benchmarks from https://github.com/soumith/convnet-benchmarks
+BM_ConvFloatBkFCPU(128, 128, 128, 3, 96, 11, 11, 4, "convnet-layer1");
+BM_ConvFloatBkFCPU(128, 64, 64, 64, 128, 9, 9, 4, "convnet-layer2");
+BM_ConvFloatBkFCPU(128, 32, 32, 128, 128, 9, 9, 4, "convnet-layer3");
+BM_ConvFloatBkFCPU(128, 16, 16, 128, 128, 7, 7, 4, "convnet-layer4");
+BM_ConvFloatBkFCPU(128, 13, 13, 384, 384, 3, 3, 4, "convnet-layer5");
+
+#define BM_ConvFloatBkFGPU(BS, R, C, ID, OD, KR, KC, LABEL) \
+ static void BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC( \
+ int iters) { \
+ BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
+ 1, VALID, true, LABEL); \
+ } \
+ BENCHMARK(BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC)
+
+// Benchmarks from https://github.com/soumith/convnet-benchmarks
+BM_ConvFloatBkFGPU(128, 128, 128, 3, 96, 11, 11, "convnet-layer1");
+BM_ConvFloatBkFGPU(128, 64, 64, 64, 128, 9, 9, "convnet-layer2");
+BM_ConvFloatBkFGPU(128, 32, 32, 128, 128, 9, 9, "convnet-layer3");
+BM_ConvFloatBkFGPU(128, 16, 16, 128, 128, 7, 7, "convnet-layer4");
+BM_ConvFloatBkFGPU(128, 13, 13, 384, 384, 3, 3, "convnet-layer5");
+
+static void BM_LRNFloat(int iters, int depth, int cols, int rows,
+ int batch_size, int range, int num_threads,
+ const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape({batch_size, rows, cols, depth});
+
+ Tensor input(DT_FLOAT, shape);
+ test::FillIota<float>(&input, 1.0);
+ inputs.push_back({nullptr, &input});
+
+  // LRN op.
+ NodeDef lrn_node_def;
+ TF_CHECK_OK(NodeDefBuilder("lrn_op", "LRN")
+ .Input("input", 0, DT_FLOAT)
+ .Attr("depth_radius", range)
+ .Attr("bias", 1.0)
+ .Attr("alpha", 0.1)
+ .Attr("beta", 0.5)
+ .Finalize(&lrn_node_def));
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), lrn_node_def, &status));
+ TF_CHECK_OK(status);
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> context(new OpKernelContext(params));
+
+ op->Compute(context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete context->release_output(0).tensor;
+ op->Compute(context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(context->mutable_output(0)->NumElements() * iters *
+ (2 * range + 1) * 2);
+ testing::SetLabel(label);
+}
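+
+// Note on the pattern above (it recurs in the pooling, relu and softmax
+// benchmarks below): the op is exercised directly through the kernel API --
+// build a NodeDef, instantiate the kernel with CreateOpKernel, wrap the
+// inputs in an OpKernelContext, and call Compute() inside the timed loop,
+// deleting the previous output each iteration so every Compute() allocates
+// a fresh output tensor.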
+
+#define BM_LRNFloatFwdCPU(DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL) \
+ static void \
+ BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS( \
+ int iters) { \
+ BM_LRNFloat(iters, DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS)
+
+// clang-format off
+// DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL
+BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 1, "lrn 1 thread");
+BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 1, "lrn 1 thread");
+BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 1, "lrn 1 thread");
+BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 4, "lrn 4 threads");
+BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 4, "lrn 4 threads");
+BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 4, "lrn 4 threads");
+BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 8, "lrn 8 threads");
+BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 8, "lrn 8 threads");
+BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 8, "lrn 8 threads");
+// clang-format on
+
+/*
+AvgPooling Op
+*/
+static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth,
+ int kernel_rows, int kernel_cols, int stride,
+ Padding padding, int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({batch_size, rows, cols, depth});
+ Tensor input1(DT_FLOAT, shape1);
+ test::FillIota<float>(&input1, 1.0);
+ inputs.push_back({nullptr, &input1});
+
+ // AvgPooling op.
+ NodeDef avgpool_node_def;
+ CHECK_EQ(kernel_rows, kernel_cols);
+ Status status = NodeDefBuilder("avgpool_op", "AvgPool")
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(&avgpool_node_def);
+ TF_CHECK_OK(status);
+
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), avgpool_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params));
+
+ op->Compute(avgpool_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete avgpool_context->release_output(0).tensor;
+ op->Compute(avgpool_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+#define BM_AvgPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
+ int iters) { \
+ BM_AvgPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
+
+// Labels are taken from the 2014-July-24 version of imagenet
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "avgpool0_VALID");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "avgpool1_VALID");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "avgpool4_VALID");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "avgpool10_VALID");
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "avgpool0_SAME");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "avgpool1_SAME");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "avgpool4_SAME");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "avgpool10_SAME");
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "avgpool0_VALID");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "avgpool1_VALID");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "avgpool4_VALID");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "avgpool10_VALID");
+BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "avgpool0_SAME");
+BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "avgpool1_SAME");
+BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "avgpool4_SAME");
+BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "avgpool10_SAME");
+
+static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols,
+ int depth, int kernel_rows, int kernel_cols,
+ int stride, Padding padding, int num_threads,
+ const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
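+  // Compute the spatial output size (and padding) that the forward AvgPool
+  // would produce for this configuration; AvgPoolGrad takes the original
+  // input shape plus a gradient tensor of that output shape.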
+ int out_height, out_width, pad_rows, pad_cols;
+ Status status =
+ Get2dOutputSize(rows, cols, kernel_rows, kernel_cols, stride, stride,
+ padding, &out_height, &out_width, &pad_rows, &pad_cols);
+ TF_CHECK_OK(status);
+ TensorShape output_shape({batch_size, out_height, out_width, depth});
+ TensorShape shape2({4});
+ Tensor input_shape_tensor(DT_INT32, shape2);
+ int32 input_dims[] = {batch_size, rows, cols, depth};
+ for (int i = 0; i < 4; i++) {
+ input_shape_tensor.flat<int32>()(i) = input_dims[i];
+ }
+ inputs.push_back({nullptr, &input_shape_tensor});
+
+ Tensor output_backprop(DT_FLOAT, output_shape);
+ test::FillIota<float>(&output_backprop, 11.0);
+ inputs.push_back({nullptr, &output_backprop});
+
+ // AvgPoolGrad op.
+ NodeDef avgpool_grad_node_def;
+ status = NodeDefBuilder("avgpool_grad_op", "AvgPoolGrad")
+ .Input(FakeInput())
+ .Input(FakeInput(DT_FLOAT))
+ .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(&avgpool_grad_node_def);
+ TF_CHECK_OK(status);
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, nullptr, cpu_allocator(), avgpool_grad_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params));
+
+ op->Compute(avgpool_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete avgpool_context->release_output(0).tensor;
+ op->Compute(avgpool_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+// The resulting symbol is too long. Need to use two macros to fit in 80 chars.
+#define BM_AvgPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
+ int iters) { \
+ BM_AvgPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
+
+// Shapes taken from the 2015/05/16 inception model
+BM_AvgPoolBkCPU(32, 35, 35, 192, 3, 3, 1, SAME, 1, "avgpool_grad0_SAME");
+BM_AvgPoolBkCPU(32, 35, 35, 256, 3, 3, 1, SAME, 1, "avgpool_grad1_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 768, 3, 3, 1, SAME, 1, "avgpool_grad2_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1024, 3, 3, 1, SAME, 1, "avgpool_grad3_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1152, 3, 3, 1, SAME, 1, "avgpool_grad4_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1216, 3, 3, 1, SAME, 1, "avgpool_grad5_SAME");
+BM_AvgPoolBkCPU(32, 17, 17, 1248, 5, 5, 3, VALID, 1, "avgpool_grad6_VALID");
+BM_AvgPoolBkCPU(32, 8, 8, 1760, 3, 3, 1, SAME, 1, "avgpool_grad7_SAME");
+BM_AvgPoolBkCPU(32, 8, 8, 2048, 8, 8, 1, VALID, 1, "avgpool_grad8_VALID");
+
+/*
+MaxPooling Op
+*/
+static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth,
+ int kernel_rows, int kernel_cols, int stride,
+ Padding padding, int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({batch_size, rows, cols, depth});
+ Tensor input1(DT_FLOAT, shape1);
+ test::FillIota<float>(&input1, 1.0);
+ inputs.push_back({nullptr, &input1});
+
+ // MaxPooling op.
+ NodeDef maxpool_node_def;
+ CHECK_EQ(kernel_rows, kernel_cols);
+ Status status = NodeDefBuilder("maxpool_op", "MaxPool")
+ .Input(FakeInput())
+ .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
+ .Attr("strides", {1, stride, stride, 1})
+ .Attr("padding", padding == VALID ? "VALID" : "SAME")
+ .Finalize(&maxpool_node_def);
+ TF_CHECK_OK(status);
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), maxpool_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> maxpool_context(new OpKernelContext(params));
+
+ op->Compute(maxpool_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete maxpool_context->release_output(0).tensor;
+ op->Compute(maxpool_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(maxpool_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+#define BM_MaxPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
+ int iters) { \
+ BM_MaxPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
+
+// Labels are taken from the 2014-July-24 version of imagenet
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "maxpool0_VALID");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "maxpool1_VALID");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "maxpool4_VALID");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "maxpool10_VALID");
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "maxpool0_SAME");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "maxpool1_SAME");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "maxpool4_SAME");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "maxpool10_SAME");
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "maxpool0_VALID");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "maxpool1_VALID");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "maxpool4_VALID");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "maxpool10_VALID");
+BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "maxpool0_SAME");
+BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "maxpool1_SAME");
+BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "maxpool4_SAME");
+BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "maxpool10_SAME");
+
+static void BM_MaxPoolBk(int iters, int batch_size, int rows, int cols,
+ int depth, int kernel_rows, int kernel_cols,
+ int stride, Padding padding, int num_threads,
+ bool use_gpu, const string& label) {
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+
+ int out_height, out_width, pad_rows, pad_cols;
+ Status status =
+ Get2dOutputSize(rows, cols, kernel_rows, kernel_cols, stride, stride,
+ padding, &out_height, &out_width, &pad_rows, &pad_cols);
+ TF_CHECK_OK(status);
+
+ Tensor input_data(DT_FLOAT, TensorShape({batch_size, rows, cols, depth}));
+ input_data.flat<float>().setRandom();
+ Node* input_data_node = ops::Const(input_data, b.opts());
+
+ Tensor output_data(DT_FLOAT,
+ TensorShape({batch_size, out_height, out_width, depth}));
+ output_data.flat<float>().setRandom();
+ Node* output_data_node = ops::Const(output_data, b.opts());
+
+ Tensor output_diff(DT_FLOAT,
+ TensorShape({batch_size, out_height, out_width, depth}));
+ output_diff.flat<float>().setRandom();
+ Node* output_diff_node = ops::Const(output_diff, b.opts());
+
+ CHECK_EQ(kernel_rows, kernel_cols);
+ ops::MaxPoolGrad(input_data_node, output_data_node, output_diff_node,
+ {1, kernel_rows, kernel_cols, 1} /* ksize */,
+ {1, stride, stride, 1} /* stride */,
+ padding == VALID ? "VALID" : "SAME", b.opts());
+ Graph* g = new Graph(OpRegistry::Global());
+ TF_CHECK_OK(b.ToGraph(g));
+ string device = use_gpu ? "gpu" : "cpu";
+ test::Benchmark(device, g).Run(iters);
+
+ testing::ItemsProcessed(batch_size * rows * cols * depth * iters);
+ testing::SetLabel(label);
+}
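+
+// Unlike the kernel-level benchmarks above, BM_MaxPoolBk builds a small graph
+// (three constants feeding MaxPoolGrad) and runs it through test::Benchmark,
+// which is what lets the same code target either the "cpu" or the "gpu"
+// device.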
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+// KR: kernel_rows
+// KC: kernel_cols
+// ST: stride. We use the same stride for both directions.
+// PT: padding
+// The resulting symbol is too long. Need to use two macros to fit in 80 chars.
+// clang-format off
+#define BM_MaxPoolBkGPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH( \
+ int iters) { \
+ BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, true, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH) \
+
+#define BM_MaxPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \
+ static void \
+ BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH( \
+ int iters) { \
+ BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, false, LABEL); \
+ } \
+ BENCHMARK( \
+ BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \
+ ##PT##_##TH)
+// clang-format on
+
+// Shapes taken from the 2015/05/16 inception model
+BM_MaxPoolBkGPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID");
+BM_MaxPoolBkGPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID");
+BM_MaxPoolBkGPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID");
+BM_MaxPoolBkGPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID");
+BM_MaxPoolBkGPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID");
+
+BM_MaxPoolBkCPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID");
+BM_MaxPoolBkCPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID");
+BM_MaxPoolBkCPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID");
+BM_MaxPoolBkCPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID");
+BM_MaxPoolBkCPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID");
+
+/*
+Relu Op
+*/
+static void BM_ReluFloat(int iters, int batch_size, int rows, int cols,
+ int depth, int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({batch_size, rows, cols, depth});
+ Tensor input1(DT_FLOAT, shape1);
+ test::FillIota<float>(&input1, 1.0);
+ inputs.push_back({nullptr, &input1});
+
+  // Relu op.
+ NodeDef relu_node_def;
+ Status status = NodeDefBuilder("relu_op", "Relu")
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(&relu_node_def);
+ TF_CHECK_OK(status);
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), relu_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> relu_context(new OpKernelContext(params));
+
+ op->Compute(relu_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete relu_context->release_output(0).tensor;
+ op->Compute(relu_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(relu_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+// BS: batch_size
+// IR: input_rows
+// IC: input_cols
+// ND: node_depth
+#define BM_Relu(BS, IR, IC, ND, TH, LABEL) \
+ static void BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH(int iters) { \
+ BM_ReluFloat(iters, BS, IR, IC, ND, TH, LABEL); \
+ } \
+ BENCHMARK(BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH)
+
+BM_Relu(32, 112, 112, 64, 1, "relu0");
+BM_Relu(32, 56, 56, 192, 1, "relu1");
+BM_Relu(32, 28, 28, 352, 1, "relu4");
+BM_Relu(32, 14, 14, 576, 1, "relu10");
+BM_Relu(32, 112, 112, 64, 4, "relu0");
+BM_Relu(32, 56, 56, 192, 4, "relu1");
+BM_Relu(32, 28, 28, 352, 4, "relu4");
+BM_Relu(32, 14, 14, 576, 4, "relu10");
+
+static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth,
+ int num_threads, const string& label) {
+ tensorflow::testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
+ EigenThreadPoolWrapper wrapper(&threadpool);
+ Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
+ device->set_eigen_cpu_device(&eigen_cpu_device);
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+ TensorShape shape1({node_depth, batch_size});
+  Tensor input1(DT_FLOAT, shape1);
+  test::FillIota<float>(&input1, 1.0);
+  inputs.push_back({nullptr, &input1});
+
+ // Softmax op.
+ NodeDef softmax_node_def;
+ TF_CHECK_OK(NodeDefBuilder("softmax_op", "Softmax")
+ .Input("input", 0, DT_FLOAT)
+ .Finalize(&softmax_node_def));
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), softmax_node_def, &status));
+ TF_CHECK_OK(status);
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> softmax_context(new OpKernelContext(params));
+
+ op->Compute(softmax_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete softmax_context->release_output(0).tensor;
+ op->Compute(softmax_context.get());
+ }
+ tensorflow::testing::StopTiming();
+ testing::ItemsProcessed(softmax_context->mutable_output(0)->NumElements() *
+ iters);
+ testing::SetLabel(label);
+}
+
+#define BM_ImageNetSoftmaxFwdCPU(BATCH_SIZE, NODE_DEPTH, TH, LABEL) \
+ static void BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH( \
+ int iters) { \
+ BM_ImageNetSoftmaxFwd(iters, BATCH_SIZE, NODE_DEPTH, TH, LABEL); \
+ } \
+ BENCHMARK(BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH)
+
+// Labels are taken from the 2014-July-24 version of imagenet
+BM_ImageNetSoftmaxFwdCPU(32, 1008, 1, "softmax32");
+BM_ImageNetSoftmaxFwdCPU(128, 1008, 1, "softmax128");
+BM_ImageNetSoftmaxFwdCPU(32, 1008, 4, "softmax32");
+BM_ImageNetSoftmaxFwdCPU(128, 1008, 4, "softmax128");
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/count_up_to_op.cc b/tensorflow/core/kernels/count_up_to_op.cc
new file mode 100644
index 0000000000..7cf4bdb6d0
--- /dev/null
+++ b/tensorflow/core/kernels/count_up_to_op.cc
@@ -0,0 +1,51 @@
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+template <class T>
+class CountUpToOp : public OpKernel {
+ public:
+ explicit CountUpToOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("limit", &limit_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ T before_increment;
+ {
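+      // Input 0 is a ref to the counter variable; hold its mutex so the
+      // read-check-increment below is atomic with respect to other ops that
+      // use the same variable.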
+ mutex_lock l(*context->input_ref_mutex(0));
+ Tensor tensor = context->mutable_input(0, true);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(tensor.shape()),
+ errors::InvalidArgument("input is not a scalar: ",
+ tensor.shape().DebugString()));
+ T* ptr = &tensor.scalar<T>()();
+ before_increment = *ptr;
+ if (*ptr >= limit_) {
+ context->SetStatus(errors::OutOfRange("Reached limit of ", limit_));
+ return;
+ }
+ ++*ptr;
+ }
+ // Output if no error.
+ Tensor* out_tensor;
+ OP_REQUIRES_OK(context, context->allocate_output("output", TensorShape({}),
+ &out_tensor));
+ out_tensor->scalar<T>()() = before_increment;
+ }
+
+ private:
+ T limit_;
+};
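+
+// Example of the resulting semantics (illustration, not from the original
+// change): with limit = 3 and the variable starting at 0, successive
+// CountUpTo calls output 0, 1 and 2 (the pre-increment values) and leave the
+// variable at 3; the next call produces no output and fails with OutOfRange.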
+
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("CountUpTo").TypeConstraint<TYPE>("T").Device(DEVICE_CPU), \
+ CountUpToOp<TYPE>)
+
+REGISTER(int32);
+REGISTER(int64);
+
+#undef REGISTER
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
new file mode 100644
index 0000000000..5d39b88166
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -0,0 +1,23 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
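+// REGISTERn(OP, DEVICE, NAME, FUNCTOR, T1, ..., Tn), from
+// cwise_ops_common.h, registers the kernel once per listed element type on
+// the given device -- so the line below covers float, double, int32 and
+// int64 on CPU.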
+REGISTER4(UnaryOp, CPU, "Abs", functor::abs, float, double, int32, int64);
+#ifndef __ANDROID__
+REGISTER_KERNEL_BUILDER(Name("ComplexAbs").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::abs<complex64>>);
+#endif
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Abs", functor::abs, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Abs")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .TypeConstraint<int32>("T"),
+ UnaryOp<CPUDevice, functor::abs<int32>>);
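+
+// Note that the int32 variant above is registered for DEVICE_GPU but uses
+// the CPUDevice functor with HostMemory inputs and outputs: the tensors stay
+// on the host and the computation runs there even though the op is placed on
+// the GPU. The same arrangement is repeated for Add, Div, Equal, etc. below.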
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add.cc b/tensorflow/core/kernels/cwise_op_add.cc
new file mode 100644
index 0000000000..a6cd4bddbe
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_add.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER7(BinaryOp, CPU, "Add", functor::add, float, double, int32, int64, int8,
+ int16, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Add", functor::add, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Add")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::add<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
new file mode 100644
index 0000000000..0a8f1313f8
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "Ceil", functor::ceil, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Ceil", functor::ceil, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_complex.cc b/tensorflow/core/kernels/cwise_op_complex.cc
new file mode 100644
index 0000000000..825181bc35
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_complex.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Complex").Device(DEVICE_CPU),
+ BinaryOp<CPUDevice, functor::make_complex<float>>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Complex").Device(DEVICE_GPU),
+ BinaryOp<GPUDevice, functor::make_complex<float>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_conj.cc b/tensorflow/core/kernels/cwise_op_conj.cc
new file mode 100644
index 0000000000..ba445d1c3d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_conj.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Conj").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::conj<complex64>>);
+#if GOOGLE_CUDA
+// REGISTER_KERNEL_BUILDER(Name("Conj").Device(DEVICE_GPU),
+// UnaryOp<GPUDevice, functor::conj<complex64>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
new file mode 100644
index 0000000000..45e24fc2ec
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Cos", functor::cos, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Cos", functor::cos, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
new file mode 100644
index 0000000000..76d606ed03
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Div", functor::div, float, double, int32, int64,
+ complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Div", functor::div, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Div")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::div<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_equal_to.cc b/tensorflow/core/kernels/cwise_op_equal_to.cc
new file mode 100644
index 0000000000..8369299332
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_equal_to.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Equal", functor::equal_to, float, double, int32,
+ int64, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Equal", functor::equal_to, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Equal")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::equal_to<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
new file mode 100644
index 0000000000..b2603a1b4c
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Exp", functor::exp, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Exp", functor::exp, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
new file mode 100644
index 0000000000..83c8203953
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "Floor", functor::floor, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Floor", functor::floor, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc
new file mode 100644
index 0000000000..59436afbc0
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
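+// DEFINE_UNARYn / DEFINE_BINARYn (from cwise_ops_gpu_common.cu.h) expand to
+// the explicit template instantiations of the GPU functors for the listed
+// types, so the GPU registrations in the matching cwise_op_*.cc files have
+// definitions to link against; the logical_and/or/not files below spell the
+// same instantiations out by hand.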
+DEFINE_UNARY3(abs, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
new file mode 100644
index 0000000000..edf8e0d1a5
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(add, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc
new file mode 100644
index 0000000000..f24c4b8b73
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(ceil, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc
new file mode 100644
index 0000000000..29086b5c71
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY1(make_complex, float);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
new file mode 100644
index 0000000000..cae22cea8e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+// DEFINE_UNARY1(conj, complex64); // not working
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc
new file mode 100644
index 0000000000..c8412496a8
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(cos, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc
new file mode 100644
index 0000000000..c581c0487e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(div, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
new file mode 100644
index 0000000000..f994822a74
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY4(equal_to, float, double, int64, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
new file mode 100644
index 0000000000..caeaa19cef
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(exp, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc
new file mode 100644
index 0000000000..0a06ff2978
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(floor, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc
new file mode 100644
index 0000000000..e1278e077b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(greater, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc
new file mode 100644
index 0000000000..fafcf9b28a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(greater_equal, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc
new file mode 100644
index 0000000000..0370782c96
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY1(get_imag, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc
new file mode 100644
index 0000000000..020abef210
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY3(inverse, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc
new file mode 100644
index 0000000000..7a3a273af7
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(isfinite, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc
new file mode 100644
index 0000000000..cfc4be3d25
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(isinf, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc
new file mode 100644
index 0000000000..c93b74387e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(isnan, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc
new file mode 100644
index 0000000000..8e2b28ac60
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(less, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc
new file mode 100644
index 0000000000..be8e34a58b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(less_equal, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc
new file mode 100644
index 0000000000..7d183cce50
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(log, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc
new file mode 100644
index 0000000000..ba7046f9f0
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc
@@ -0,0 +1,13 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct BinaryFunctor<GPUDevice, logical_and, 1>;
+template struct BinaryFunctor<GPUDevice, logical_and, 2>;
+template struct BinaryFunctor<GPUDevice, logical_and, 3>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc
new file mode 100644
index 0000000000..34a43a76ef
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct UnaryFunctor<GPUDevice, logical_not>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc
new file mode 100644
index 0000000000..47a7bd68dc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc
@@ -0,0 +1,13 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct BinaryFunctor<GPUDevice, logical_or, 1>;
+template struct BinaryFunctor<GPUDevice, logical_or, 2>;
+template struct BinaryFunctor<GPUDevice, logical_or, 3>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc
new file mode 100644
index 0000000000..8f7ab90e9a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(maximum, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc
new file mode 100644
index 0000000000..75fd7f89b4
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(minimum, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc
new file mode 100644
index 0000000000..d08a17a94d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+// No GPU ops for mod yet.
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
new file mode 100644
index 0000000000..e0a6738bef
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(mul, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc
new file mode 100644
index 0000000000..3031afbb75
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY4(neg, float, double, int32, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc
new file mode 100644
index 0000000000..59c76ee88b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY4(not_equal_to, float, double, int64, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc
new file mode 100644
index 0000000000..50177495bc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(pow, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc
new file mode 100644
index 0000000000..3b1d465914
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY1(get_real, complex64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc
new file mode 100644
index 0000000000..682e2d2d4b
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(rsqrt, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
new file mode 100644
index 0000000000..b5125648e3
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -0,0 +1,15 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+template struct SelectFunctor<GPUDevice, float>;
+template struct SelectFunctor<GPUDevice, double>;
+template struct SelectFunctor<GPUDevice, int32>;
+template struct SelectFunctor<GPUDevice, int64>;
+template struct SelectFunctor<GPUDevice, complex64>;
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc
new file mode 100644
index 0000000000..9c250f3071
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(sigmoid, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc
new file mode 100644
index 0000000000..f413480ecc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY3(sign, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc
new file mode 100644
index 0000000000..6135f3b780
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(sin, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc
new file mode 100644
index 0000000000..9bdf3b9e30
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(sqrt, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc
new file mode 100644
index 0000000000..6b900e994d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY3(square, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc
new file mode 100644
index 0000000000..6fd5ea0d38
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY3(sub, float, double, int64);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc
new file mode 100644
index 0000000000..e0393f6c2a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc
@@ -0,0 +1,11 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(tanh, float, double);
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
new file mode 100644
index 0000000000..9ae31dcdfe
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Greater", functor::greater, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Greater", functor::greater, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Greater")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::greater<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
new file mode 100644
index 0000000000..be4cc5dc79
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -0,0 +1,22 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, float, double,
+ int32, int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "GreaterEqual", functor::greater_equal, float, double,
+ int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::greater_equal<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_imag.cc b/tensorflow/core/kernels/cwise_op_imag.cc
new file mode 100644
index 0000000000..c2432326fc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_imag.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Imag").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::get_imag<complex64>>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Imag").Device(DEVICE_GPU),
+ UnaryOp<GPUDevice, functor::get_imag<complex64>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_inverse.cc b/tensorflow/core/kernels/cwise_op_inverse.cc
new file mode 100644
index 0000000000..6af883e755
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_inverse.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Inv", functor::inverse, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Inv", functor::inverse, float, double, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
new file mode 100644
index 0000000000..e52d199a8f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "IsFinite", functor::isfinite, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "IsFinite", functor::isfinite, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
new file mode 100644
index 0000000000..868204f86e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "IsInf", functor::isinf, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "IsInf", functor::isinf, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
new file mode 100644
index 0000000000..a8f4d60d0f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(UnaryOp, CPU, "IsNan", functor::isnan, float, double);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "IsNan", functor::isnan, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
new file mode 100644
index 0000000000..3b5f75445c
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -0,0 +1,20 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Less", functor::less, float, double, int32, int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Less", functor::less, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Less")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::less<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
new file mode 100644
index 0000000000..507c7c2908
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -0,0 +1,22 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "LessEqual", functor::less_equal, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "LessEqual", functor::less_equal, float, double,
+ int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("LessEqual")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::less_equal<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
new file mode 100644
index 0000000000..ebc7cbcc4e
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Log", functor::log, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Log", functor::log, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_logical_and.cc b/tensorflow/core/kernels/cwise_op_logical_and.cc
new file mode 100644
index 0000000000..a4075088f4
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_logical_and.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("LogicalAnd").Device(DEVICE_CPU),
+ BinaryOp<CPUDevice, functor::logical_and>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("LogicalAnd").Device(DEVICE_GPU),
+ BinaryOp<GPUDevice, functor::logical_and>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_logical_not.cc b/tensorflow/core/kernels/cwise_op_logical_not.cc
new file mode 100644
index 0000000000..b2e97bf70c
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_logical_not.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("LogicalNot").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::logical_not>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("LogicalNot").Device(DEVICE_GPU),
+ UnaryOp<GPUDevice, functor::logical_not>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_logical_or.cc b/tensorflow/core/kernels/cwise_op_logical_or.cc
new file mode 100644
index 0000000000..0d1df082f7
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_logical_or.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("LogicalOr").Device(DEVICE_CPU),
+ BinaryOp<CPUDevice, functor::logical_or>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("LogicalOr").Device(DEVICE_GPU),
+ BinaryOp<GPUDevice, functor::logical_or>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
new file mode 100644
index 0000000000..c0c9e3f6f5
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Maximum", functor::maximum, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Maximum", functor::maximum, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Maximum")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::maximum<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
new file mode 100644
index 0000000000..4c6bf7df05
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(BinaryOp, CPU, "Minimum", functor::minimum, float, double, int32,
+ int64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Minimum", functor::minimum, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Minimum")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::minimum<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mod.cc b/tensorflow/core/kernels/cwise_op_mod.cc
new file mode 100644
index 0000000000..17f2834030
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_mod.cc
@@ -0,0 +1,6 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER2(BinaryOp, CPU, "Mod", functor::mod, int32, int64);
+REGISTER2(BinaryOp, CPU, "Mod", functor::fmod, float, double);
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul.cc b/tensorflow/core/kernels/cwise_op_mul.cc
new file mode 100644
index 0000000000..15f65012cd
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_mul.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER7(BinaryOp, CPU, "Mul", functor::mul, float, double, int32, int64, int8,
+ int16, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Mul", functor::mul, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Mul")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::mul<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
new file mode 100644
index 0000000000..3a19b2e94f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -0,0 +1,9 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(UnaryOp, CPU, "Neg", functor::neg, float, double, int32, complex64,
+ int64);
+#if GOOGLE_CUDA
+REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, double, int32, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to.cc b/tensorflow/core/kernels/cwise_op_not_equal_to.cc
new file mode 100644
index 0000000000..02d434a1c2
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, double,
+ int32, int64, complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, double,
+ int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
new file mode 100644
index 0000000000..d10dced85f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -0,0 +1,9 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, double, int32, int64,
+ complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Pow", functor::pow, float, double, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_real.cc b/tensorflow/core/kernels/cwise_op_real.cc
new file mode 100644
index 0000000000..84295a5a16
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_real.cc
@@ -0,0 +1,10 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_KERNEL_BUILDER(Name("Real").Device(DEVICE_CPU),
+ UnaryOp<CPUDevice, functor::get_real<complex64>>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Real").Device(DEVICE_GPU),
+ UnaryOp<GPUDevice, functor::get_real<complex64>>);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc
new file mode 100644
index 0000000000..a22b1209de
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Rsqrt", functor::rsqrt, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
new file mode 100644
index 0000000000..baa821690a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -0,0 +1,17 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER_SELECT(CPU, "Select", "", float);
+REGISTER_SELECT(CPU, "Select", "", double);
+REGISTER_SELECT(CPU, "Select", "", int32);
+REGISTER_SELECT(CPU, "Select", "", int64);
+REGISTER_SELECT(CPU, "Select", "", complex64);
+REGISTER_SELECT(CPU, "Select", "", string);
+#if GOOGLE_CUDA
+REGISTER_SELECT(GPU, "Select", "", float);
+REGISTER_SELECT(GPU, "Select", "", double);
+REGISTER_SELECT(GPU, "Select", "", int32);
+REGISTER_SELECT(GPU, "Select", "", int64);
+REGISTER_SELECT(GPU, "Select", "", complex64);
+#endif // GOOGLE_CUDA
+} // namespace tensorflow
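REGISTER_SELECT is defined in cwise_ops_common.h later in this patch; its third argument is currently unused by the macro. Each line above is roughly equivalent to a registration of the form:

REGISTER_KERNEL_BUILDER(
    Name("Select").Device(DEVICE_CPU).TypeConstraint<float>("T"),
    SelectOp<CPUDevice, float>)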
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
new file mode 100644
index 0000000000..e03b5d54dd
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Sigmoid", functor::sigmoid, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
new file mode 100644
index 0000000000..59a0bfa1ed
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -0,0 +1,19 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER4(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64);
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Sign", functor::sign, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Sign")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .TypeConstraint<int32>("T"),
+ UnaryOp<CPUDevice, functor::sign<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
new file mode 100644
index 0000000000..e7c87374d7
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Sin", functor::sin, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Sin", functor::sin, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
new file mode 100644
index 0000000000..f43241264a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Sqrt", functor::sqrt, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Sqrt", functor::sqrt, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
new file mode 100644
index 0000000000..510fda49aa
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -0,0 +1,9 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(UnaryOp, CPU, "Square", functor::square, float, double, int32,
+ complex64, int64);
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "Square", functor::square, float, double, int64);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
new file mode 100644
index 0000000000..c3c5952f8d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -0,0 +1,21 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER5(BinaryOp, CPU, "Sub", functor::sub, float, double, int32, int64,
+ complex64);
+#if GOOGLE_CUDA
+REGISTER3(BinaryOp, GPU, "Sub", functor::sub, float, double, int64);
+#endif
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Sub")
+ .Device(DEVICE_GPU)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::sub<int32>>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
new file mode 100644
index 0000000000..31f4743449
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "Tanh", functor::tanh, float, double, complex64);
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Tanh", functor::tanh, float, double);
+#endif
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
new file mode 100644
index 0000000000..7d818cfbbf
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -0,0 +1,607 @@
+#ifndef TENSORFLOW_KERNELS_CWISE_OPS_H_
+#define TENSORFLOW_KERNELS_CWISE_OPS_H_
+
+#include <cmath>
+#include <functional>
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+// The following functors (sign, tanh, sigmoid, etc.) are not defined
+// by Eigen. When their equivalents are added to Eigen, we can
+// replace them with type aliases.
+
+namespace Eigen {
+namespace internal {
+
+template <typename T>
+struct scalar_sign_op {
+ // TODO(zhifengc): this only works for real types. In theory,
+ // sign(x) = x / |x| works for both real and complex values.
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+ return T(x > T(0)) - T(x < T(0));
+ }
+};
+
+// TODO(zhifengc): Eigen::internal::pow_impl does not have proper
+// EIGEN host/device decoration. We duplicate code here for now.
+template <typename T, bool IsInteger>
+struct pow {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T
+ operator()(const T& x, const T& y) const {
+ return std::pow(x, y);
+ }
+};
+
+template <typename T>
+struct pow<T, true> {
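+ // Integer exponentiation by squaring: e.g., operator()(3, 5) returns
+ // 3^5 = 243 using O(log y) multiplications.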
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x, T y) const {
+ T res(1);
+ if (y & 1) res *= x;
+ y >>= 1;
+ while (y) {
+ x *= x;
+ if (y & 1) res *= x;
+ y >>= 1;
+ }
+ return res;
+ }
+};
+
+template <typename T>
+struct scalar_pow2_op : pow<T, NumTraits<T>::IsInteger> {};
+
+template <typename T>
+struct functor_traits<scalar_pow2_op<T> > {
+ enum {
+ Cost = 5 * NumTraits<T>::MulCost,
+ PacketAccess = false,
+ };
+};
+
+template <typename T>
+struct scalar_fmod2_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod2_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
+ const T& b) const {
+ return fmod(a, b);
+ }
+};
+
+template <typename T>
+struct scalar_mod2_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T
+ operator()(const T& a, const T& b) const {
+ return a % b;
+ }
+};
+
+template <typename T>
+struct functor_traits<scalar_mod2_op<T> > {
+ enum {
+ Cost = 5, // Roughly the cost of a div
+ PacketAccess = false,
+ };
+};
+
+// scalar_left and scalar_right are template helpers to partially
+// apply a binary function.
+//
+// Suppose Binary is a binary functor f(x, y), scalar_left<> is a
+// unary functor g_x(y) = f(x, y), where x is provided via the
+// constructor. Similarly, scalar_right<> is a unary functor g_y(x) =
+// f(x, y).
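+//
+// For example, scalar_left<float, float, scalar_sum_op<float> > constructed
+// with a pointer to the value 2.0f behaves as the unary functor y -> 2.0f + y.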
+
+template <typename Tout, typename Tin, typename Binary,
+ bool PacketAccess = functor_traits<Binary>::PacketAccess>
+struct scalar_left {
+ typedef Tout result_type;
+ const Tin* left;
+ EIGEN_DEVICE_FUNC inline scalar_left(
+ const scalar_left& other) // NOLINT(runtime/explicit)
+ : left(other.left) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c) : left(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& right) const {
+ return Binary()(*left, right);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct scalar_left<Tout, Tin, Binary, true> {
+ typedef Tout result_type;
+ const Tin* left;
+ EIGEN_DEVICE_FUNC inline scalar_left(
+ const scalar_left& other) // NOLINT(runtime/explicit)
+ : left(other.left) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c) : left(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& right) const {
+ return Binary()(*left, right);
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& right_packet) const {
+ const Packet left_packet = Eigen::internal::pset1<Packet>(*left);
+ return Binary().packetOp(left_packet, right_packet);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct functor_traits<scalar_left<Tout, Tin, Binary> > {
+ enum {
+ Cost = functor_traits<Binary>::Cost,
+ PacketAccess = functor_traits<Binary>::PacketAccess,
+ };
+};
+
+template <typename Tout, typename Tin, typename Binary,
+ bool PacketAccess = functor_traits<Binary>::PacketAccess>
+struct scalar_right {
+ typedef Tout result_type;
+ const Tin* right;
+ EIGEN_DEVICE_FUNC inline scalar_right(
+ const scalar_right& other) // NOLINT(runtime/explicit)
+ : right(other.right) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c) : right(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& left) const {
+ return Binary()(left, *right);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct scalar_right<Tout, Tin, Binary, true> {
+ typedef Tout result_type;
+ const Tin* right;
+ EIGEN_DEVICE_FUNC inline scalar_right(
+ const scalar_right& other) // NOLINT(runtime/explicit)
+ : right(other.right) {}
+ EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c) : right(c) {}
+ EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& left) const {
+ return Binary()(left, *right);
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& left_packet) const {
+ const Packet right_packet = Eigen::internal::pset1<Packet>(*right);
+ return Binary().packetOp(left_packet, right_packet);
+ }
+};
+
+template <typename Tout, typename Tin, typename Binary>
+struct functor_traits<scalar_right<Tout, Tin, Binary> > {
+ enum {
+ Cost = functor_traits<Binary>::Cost,
+ PacketAccess = functor_traits<Binary>::PacketAccess,
+ };
+};
+
+// similar to std::equal_to, but with the DEVICE_FUNC qualifier
+template <class T>
+struct equal_to : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x == y; }
+};
+
+// similar to std::not_equal_to, but with the DEVICE_FUNC qualifier
+template <class T>
+struct not_equal_to : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x != y; }
+};
+
+// similar to std::greater, but with the DEVICE_FUNC qualifier
+template <class T>
+struct greater : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x > y; }
+};
+
+// similar to std::less, but with the DEVICE_FUNC qualifier
+template <class T>
+struct less : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x < y; }
+};
+
+// similar to std::greater_equal, but with the DEVICE_FUNC qualifier
+template <class T>
+struct greater_equal : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x >= y; }
+};
+
+// similar to std::less_equal, but with the DEVICE_FUNC qualifier
+template <class T>
+struct less_equal : std::binary_function<T, T, bool> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool operator()(const T& x, const T& y) const { return x <= y; }
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+namespace tensorflow {
+namespace functor {
+
+////////////////////////////////////////////////////////////////////////////////
+// Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+// Base template for functors whose input scalar type is T and
+// output scalar type is R.
+template <typename T, typename F, typename R = T>
+struct base {
+ // func defines operator() and its vectorized version packetOp().
+ typedef F func;
+
+ // If true, the functor's corresponding binary op will instantiate
+ // specialized kernels to perform an optimized broadcast
+ // operation. Each functor for which this is enabled increases the
+ // code size, so by default this is disabled for binary functors and
+ // is enabled on a per-op basis as needed.
+ static const bool use_bcast_optimization = false;
+
+ // operator() has the signature:
+ // out_type operator()(in_type in0, in_type in1 ...)
+ typedef R out_type;
+ typedef T in_type;
+
+ // TensorFlow provides tensor-ized versions of "func". Roughly
+ // speaking, the TensorFlow operation has the signature:
+ // tout_type op(tin_type in0)
+ // tout_type op(tin_type in0, tin_type in1)
+ // tout_type op(tin_type in0, in_type scalar)
+ typedef typename TTypes<out_type>::Flat tout_type;
+ typedef typename TTypes<in_type>::ConstFlat tin_type;
+ typedef typename TTypes<in_type>::ConstScalar tscalar_type;
+};
+
+// For now, we only apply certain speed optimizations to
+// float/double broadcast binary ops.
+template <typename T>
+struct use_bcast_optimization {
+ static const bool value = false;
+};
+
+template <>
+struct use_bcast_optimization<float> {
+ static const bool value = true;
+};
+
+template <>
+struct use_bcast_optimization<double> {
+ static const bool value = true;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Unary functors
+////////////////////////////////////////////////////////////////////////////////
+
+// abs(x) = |x|
+// neg(x) = - x
+// inverse(x) = 1 / x
+// square(x) = x^2
+// sqrt(x) = x^(1/2)
+// rsqrt(x) = x^(-1/2)
+// exp(x) = e^x
+// log(x) = natural logarithm of x
+// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+// sigmoid(x) = 1 / (1 + exp(-x)) // a.k.a. logistic
+//
+// NOTE: We may eventually implement common functions used in NN
+// here. E.g., rectifier, softplus, derivatives of tanh, sigmoid, etc.
+// For reference, see speech/lstm/eigen_functors.h.
+
+template <typename T>
+struct abs : base<T, Eigen::internal::scalar_abs_op<T>,
+ typename Eigen::internal::scalar_abs_op<T>::result_type> {};
+
+template <typename T>
+struct neg : base<T, Eigen::internal::scalar_opposite_op<T> > {};
+
+template <typename T>
+struct inverse : base<T, Eigen::internal::scalar_inverse_op<T> > {};
+
+template <typename T>
+struct square : base<T, Eigen::internal::scalar_square_op<T> > {};
+
+template <typename T>
+struct sqrt : base<T, Eigen::internal::scalar_sqrt_op<T> > {};
+
+template <typename T>
+struct rsqrt : base<T, Eigen::internal::scalar_rsqrt_op<T> > {};
+
+template <typename T>
+struct exp : base<T, Eigen::internal::scalar_exp_op<T> > {};
+
+template <typename T>
+struct log : base<T, Eigen::internal::scalar_log_op<T> > {};
+
+template <typename T>
+struct sign : base<T, Eigen::internal::scalar_sign_op<T> > {};
+
+template <typename T>
+struct tanh : base<T, Eigen::internal::scalar_tanh_op<T> > {};
+
+template <typename T>
+struct sigmoid : base<T, Eigen::internal::scalar_sigmoid_op<T> > {};
+
+template <typename T>
+struct sin : base<T, Eigen::internal::scalar_sin_op<T> > {};
+
+template <typename T>
+struct cos : base<T, Eigen::internal::scalar_cos_op<T> > {};
+
+struct logical_not : base<bool, std::logical_not<bool> > {};
+
+namespace impl {
+
+#ifndef __CUDACC__
+// Uses STL std cmath functions.
+template <typename T>
+bool isinf(T v) {
+ return std::isinf(v);
+}
+
+template <typename T>
+bool isnan(T v) {
+ return std::isnan(v);
+}
+
+template <typename T>
+bool isfinite(T v) {
+ return std::isfinite(v);
+}
+
+template <typename T>
+T floor(T v) {
+ return std::floor(v);
+}
+
+template <typename T>
+T ceil(T v) {
+ return std::ceil(v);
+}
+#else
+// Uses CUDA's functions for float and double.
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isinf(T v) {
+ return ::isinf(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isnan(T v) {
+ return ::isnan(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isfinite(T v) {
+ return ::isfinite(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T floor(T v) {
+ return ::floor(v);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T ceil(T v) {
+ return ::ceil(v);
+}
+#endif
+} // end namespace impl
+
+// NOTE: std::isinf, std::isnan, std::isfinite are plain functions, so
+// we wrap them in functors to be used with Eigen's type system.
+
+template <typename T>
+struct isinf_func {
+ typedef bool result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
+ return impl::isinf(x);
+ }
+};
+
+template <typename T>
+struct isinf : base<T, isinf_func<T>, bool> {};
+
+template <typename T>
+struct isnan_func {
+ typedef bool result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
+ return impl::isnan(x);
+ }
+};
+
+template <typename T>
+struct isnan : base<T, isnan_func<T>, bool> {};
+
+template <typename T>
+struct isfinite_func {
+ typedef bool result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
+ return impl::isfinite(x);
+ }
+};
+
+template <typename T>
+struct isfinite : base<T, isfinite_func<T>, bool> {};
+
+template <typename T>
+struct floor_func {
+ typedef T result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
+ return impl::floor(x);
+ }
+};
+
+template <typename T>
+struct floor : base<T, floor_func<T> > {};
+
+template <typename T>
+struct ceil_func {
+ typedef T result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
+ return impl::ceil(x);
+ }
+};
+
+template <typename T>
+struct ceil : base<T, ceil_func<T> > {};
+
+////////////////////////////////////////////////////////////////////////////////
+// Binary functors
+////////////////////////////////////////////////////////////////////////////////
+
+// Binary functors:
+//
+// add(x, y) = x + y
+// sub(x, y) = x - y
+// mul(x, y) = x * y
+// div(x, y) = x / y
+// mod(x, y) = x % y (int32 and int64 only)
+// fmod(x, y) = fmod(x, y) (float and double only)
+// pow(x, y) = x ^ y
+// maximum(x, y) = x > y ? x : y
+// minimum(x, y) = x < y ? x : y
+
+template <typename T>
+struct add : base<T, Eigen::internal::scalar_sum_op<T> > {
+ static const bool use_bcast_optimization = true;
+};
+
+template <typename T>
+struct sub : base<T, Eigen::internal::scalar_difference_op<T> > {
+ static const bool use_bcast_optimization = true;
+};
+
+template <typename T>
+struct mul : base<T, Eigen::internal::scalar_product_op<T> > {};
+
+template <typename T>
+struct div : base<T, Eigen::internal::scalar_quotient_op<T> > {};
+
+template <typename T>
+struct fmod : base<T, Eigen::internal::scalar_fmod2_op<T> > {};
+
+template <typename T>
+struct mod : base<T, Eigen::internal::scalar_mod2_op<T> > {};
+
+template <typename T>
+struct pow : base<T, Eigen::internal::scalar_pow2_op<T> > {};
+
+template <typename T>
+struct maximum : base<T, Eigen::internal::scalar_max_op<T> > {};
+
+template <typename T>
+struct minimum : base<T, Eigen::internal::scalar_min_op<T> > {};
+
+template <typename T>
+struct less : base<T, Eigen::internal::less<T>, bool> {};
+
+template <typename T>
+struct less_equal : base<T, Eigen::internal::less_equal<T>, bool> {};
+
+template <typename T>
+struct greater : base<T, Eigen::internal::greater<T>, bool> {};
+
+template <typename T>
+struct greater_equal : base<T, Eigen::internal::greater_equal<T>, bool> {};
+
+template <typename T>
+struct equal_to : base<T, Eigen::internal::equal_to<T>, bool> {};
+
+template <typename T>
+struct not_equal_to : base<T, Eigen::internal::not_equal_to<T>, bool> {};
+
+struct logical_and : base<bool, Eigen::internal::scalar_boolean_and_op> {};
+
+struct logical_or : base<bool, Eigen::internal::scalar_boolean_or_op> {};
+
+template <typename T>
+struct make_complex_func {
+ typedef std::complex<T> result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ result_type operator()(T real, T imag) const {
+ return std::complex<T>(real, imag);
+ }
+};
+
+template <typename T>
+struct make_complex : base<T, make_complex_func<T>, std::complex<T> > {};
+
+template <typename T>
+struct get_real
+ : base<T, Eigen::internal::scalar_real_op<T>, typename T::value_type> {};
+
+template <typename T>
+struct get_imag
+ : base<T, Eigen::internal::scalar_imag_op<T>, typename T::value_type> {};
+
+template <typename T>
+struct conj : base<T, Eigen::internal::scalar_conjugate_op<T> > {};
+
+////////////////////////////////////////////////////////////////////////////////
+// Functors that take 1 or 2 tensors, compute the base functor on each
+// coefficient of the input tensors, and put the results in the output
+// tensor.
+////////////////////////////////////////////////////////////////////////////////
+template <typename Device, typename Functor>
+struct UnaryFunctor {
+ // Computes on device "d": out[i] = Functor(in[i])
+ void operator()(const Device& d, typename Functor::tout_type out,
+ typename Functor::tin_type in);
+};
+
+template <typename Device, typename Functor, int NDIMS>
+struct BinaryFunctor {
+ // Computes on device "d": out[i] = Functor(in0[i], in1[i])
+ void operator()(const Device& d, typename Functor::tout_type out,
+ typename Functor::tin_type in0,
+ typename Functor::tin_type in1);
+
+ // Computes on device "d": out[i] = Functor(scalar[0], in[i])
+ void Left(const Device& d, typename Functor::tout_type out,
+ typename Functor::tscalar_type scalar,
+ typename Functor::tin_type in);
+
+ // Computes on device "d": out[i] = Functor(in[i], scalar[0])
+ void Right(const Device& d, typename Functor::tout_type out,
+ typename Functor::tin_type in,
+ typename Functor::tscalar_type scalar);
+
+ // Computes on device "d":
+ // out = Functor(in0.broadcast(bcast0), in1.broadcast(bcast1))
+ //
+ // TODO(zhifengc): make BCast a template member function on NDIMS
+ // instead of making BinaryFunctor a template on NDIMS.
+ void BCast(const Device& d,
+ typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1);
+};
+
+template <int NDIMS>
+bool AllOne(const typename Eigen::array<Eigen::DenseIndex, NDIMS>& a) {
+ for (int i = 0; i < a.size(); ++i) {
+ if (a[i] != 1) return false;
+ }
+ return true;
+}
+
+template <typename Device, typename T>
+struct SelectFunctor {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<bool>::ConstFlat cond_flat,
+ typename TTypes<T>::ConstFlat then_flat,
+ typename TTypes<T>::ConstFlat else_flat);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CWISE_OPS_H_
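A new coefficient-wise unary op would follow the floor/ceil pattern above: a small scalar functor plus a base<> wrapper. A minimal sketch, assuming a hypothetical "Round" op (the round name, round_func helper, and its half-up rounding are illustrative only and not part of this change):

namespace tensorflow {
namespace functor {

template <typename T>
struct round_func {
  typedef T result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
    // Naive round-half-up, for illustration only.
    return impl::floor(x + T(0.5));
  }
};

template <typename T>
struct round : base<T, round_func<T> > {};

}  // namespace functor
}  // namespace tensorflow

The functor alone does nothing until a kernel is registered for it; a matching registration file is sketched after cwise_ops_common.h below.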
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
new file mode 100644
index 0000000000..f86d2ddd9a
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -0,0 +1,42 @@
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+
+BinaryOpShared::BinaryOpShared(OpKernelConstruction* ctx, DataType out,
+ DataType in)
+ : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({in, in}, {out}));
+}
+
+void BinaryOpShared::SetUnimplementedError(OpKernelContext* ctx) {
+ ctx->SetStatus(errors::Unimplemented(
+ "Broadcast between ", ctx->input(0).shape().ShortDebugString(), " and ",
+ ctx->input(1).shape().ShortDebugString(), " is not supported yet."));
+}
+
+static BCast::Vec FromShape(const TensorShape& shape) {
+ BCast::Vec ret;
+ for (int i = 0; i < shape.dims(); ++i) ret.push_back(shape.dim_size(i));
+ return ret;
+}
+
+static TensorShape ToShape(const BCast::Vec& vec) {
+ TensorShape shape;
+ for (auto elem : vec) shape.AddDim(elem);
+ return shape;
+}
+
+BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
+ : bcast(FromShape(ctx->input(0).shape()),
+ FromShape(ctx->input(1).shape())) {
+ if (!bcast.IsValid()) {
+ ctx->SetStatus(errors::InvalidArgument(
+ "Incompatible shapes: ", ctx->input(0).shape().ShortDebugString(),
+ " vs. ", ctx->input(1).shape().ShortDebugString()));
+ return;
+ }
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_output(0, ToShape(bcast.output_shape()), &out));
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
new file mode 100644
index 0000000000..cf848b86d1
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -0,0 +1,390 @@
+#ifndef TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
+
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/cwise_ops.h"
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/bcast.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+class BinaryOpShared : public OpKernel {
+ public:
+ explicit BinaryOpShared(OpKernelConstruction* ctx, DataType out, DataType in);
+
+ protected:
+ struct BinaryOpState {
+ // Sets up bcast with the shape of in0 and in1, ensures that the bcast
+ // is valid, and if so, allocates out using ctx->allocate_output(...).
+ // Caller must check ctx->status() upon return for non-ok status.
+ // If ctx->status().ok() is true, then out is guaranteed to be allocated.
+ BinaryOpState(OpKernelContext* ctx);
+
+ BCast bcast;
+ Tensor* out = nullptr;
+ };
+
+ template <int NDIMS>
+ static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+ const BCast::Vec& vec) {
+ CHECK_EQ(vec.size(), NDIMS);
+ Eigen::array<Eigen::DenseIndex, NDIMS> ret;
+ for (int i = 0; i < NDIMS; ++i) ret[i] = vec[i];
+ return ret;
+ }
+ void SetUnimplementedError(OpKernelContext* ctx);
+};
+
+// Coefficient-wise binary operations:
+// Device: E.g., CPUDevice, GPUDevice.
+// Functor: defined in cwise_ops.h. E.g., functor::add.
+template <typename Device, typename Functor>
+class BinaryOp : public BinaryOpShared {
+ public:
+ typedef typename Functor::in_type Tin; // Input scalar data type.
+ typedef typename Functor::out_type Tout; // Output scalar data type.
+
+ explicit BinaryOp(OpKernelConstruction* ctx)
+ : BinaryOpShared(ctx, DataTypeToEnum<Tout>::v(),
+ DataTypeToEnum<Tin>::v()) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& in0 = ctx->input(0);
+ const Tensor& in1 = ctx->input(1);
+ // 'state': Shared helper not dependent on T to reduce code size
+ BinaryOpState state(ctx);
+ if (!ctx->status().ok()) return;
+ Tensor* out = state.out;
+ BCast* bcast = &state.bcast;
+ if (out->NumElements() == 0) {
+ return;
+ }
+ const int ndims = bcast->x_reshape().size();
+ if (ndims <= 1) {
+ if (in1.NumElements() == 1) {
+ // tensor op scalar
+ functor::BinaryFunctor<Device, Functor, 1>().Right(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), in0.flat<Tin>(),
+ in1.scalar<Tin>());
+ return;
+ }
+ if (in0.NumElements() == 1) {
+ // scalar op tensor
+ functor::BinaryFunctor<Device, Functor, 1>().Left(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), in0.scalar<Tin>(),
+ in1.flat<Tin>());
+ return;
+ }
+ functor::BinaryFunctor<Device, Functor, 1>()(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), in0.flat<Tin>(),
+ in1.flat<Tin>());
+ return;
+ }
+
+ if (ndims == 2) {
+ functor::BinaryFunctor<Device, Functor, 2>().BCast(
+ ctx->eigen_device<Device>(),
+ out->shaped<Tout, 2>(bcast->result_shape()),
+ in0.shaped<Tin, 2>(bcast->x_reshape()),
+ ToIndexArray<2>(bcast->x_bcast()),
+ in1.shaped<Tin, 2>(bcast->y_reshape()),
+ ToIndexArray<2>(bcast->y_bcast()));
+ return;
+ }
+
+ if (ndims == 3) {
+ functor::BinaryFunctor<Device, Functor, 3>().BCast(
+ ctx->eigen_device<Device>(),
+ out->shaped<Tout, 3>(bcast->result_shape()),
+ in0.shaped<Tin, 3>(bcast->x_reshape()),
+ ToIndexArray<3>(bcast->x_bcast()),
+ in1.shaped<Tin, 3>(bcast->y_reshape()),
+ ToIndexArray<3>(bcast->y_bcast()));
+ return;
+ }
+
+ SetUnimplementedError(ctx);
+ }
+
+ private:
+};
+
+// Coefficient-wise unary operations:
+// Device: E.g., CPUDevice, GPUDevice.
+// Functor: defined in cwise_ops.h. E.g., functor::sqrt.
+template <typename Device, typename Functor>
+class UnaryOp : public OpKernel {
+ public:
+ typedef typename Functor::in_type Tin; // Input scalar data type.
+ typedef typename Functor::out_type Tout; // Output scalar data type.
+ // Tin may be different from Tout. E.g., abs: complex64 -> float
+
+ explicit UnaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ auto in = DataTypeToEnum<Tin>::v();
+ auto out = DataTypeToEnum<Tout>::v();
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({in}, {out}));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
+ functor::UnaryFunctor<Device, Functor>()(
+ ctx->eigen_device<Device>(), out->flat<Tout>(), inp.flat<Tin>());
+ }
+};
+
+// Coefficient-wise select operation.
+// Device: E.g., CPUDevice, GPUDevice.
+template <typename Device, typename T>
+class SelectOp : public OpKernel {
+ public:
+ explicit SelectOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ auto dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({DT_BOOL, dt, dt}, {dt}));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& in0 = ctx->input(0);
+ const Tensor& in1 = ctx->input(1);
+ const Tensor& in2 = ctx->input(2);
+ if (!ctx->ValidateInputsAreSameShape(this)) return;
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out));
+ functor::SelectFunctor<Device, T> func;
+ func(ctx->eigen_device<Device>(), out->flat<T>(), in0.flat<bool>(),
+ in1.flat<T>(), in2.flat<T>());
+ }
+};
+
+namespace functor {
+
+// For CPUDevice, we do operations inline if the resulting tensor is
+// modestly sized.
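+// Assigning inline avoids dispatching a small amount of work to the Eigen
+// thread pool, where the scheduling overhead would dominate the work itself.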
+static bool DoInline(size_t size) { return size <= 32768; }
+
+template <typename D, typename OUT, typename RHS>
+void Assign(const D& d, OUT out, RHS rhs) {
+ if (DoInline(out.size())) {
+ out = rhs;
+ } else {
+ out.device(d) = rhs;
+ }
+}
+
+// Partial specialization of BinaryFunctor<Device=CPUDevice, Functor>.
+template <typename Functor, int NDIMS>
+struct BinaryFunctor<CPUDevice, Functor, NDIMS> {
+ void operator()(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in0,
+ typename Functor::tin_type in1) {
+ Assign(d, out, in0.binaryExpr(in1, typename Functor::func()));
+ }
+
+ void Left(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tscalar_type scalar,
+ typename Functor::tin_type in) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary;
+ Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+ }
+
+ void Right(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in,
+ typename Functor::tscalar_type scalar) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary;
+ Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+ }
+
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ inline Eigen::DSizes<int, 2> NByOne(int n) {
+ return Eigen::DSizes<int, 2>(n, 1);
+ }
+ inline Eigen::DSizes<int, 2> OneByM(int m) {
+ return Eigen::DSizes<int, 2>(1, m);
+ }
+#else
+ inline Eigen::IndexList<int, Eigen::type2index<1>> NByOne(int n) {
+ Eigen::IndexList<int, Eigen::type2index<1>> ret;
+ ret.set(0, n);
+ return ret;
+ }
+ inline Eigen::IndexList<Eigen::type2index<1>, int> OneByM(int m) {
+ Eigen::IndexList<Eigen::type2index<1>, int> ret;
+ ret.set(1, m);
+ return ret;
+ }
+#endif
+
+ void BCast(const CPUDevice& dev,
+ typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1) {
+ typedef typename Functor::in_type T;
+ typename Functor::func func;
+ if ((NDIMS == 2) && Functor::use_bcast_optimization &&
+ use_bcast_optimization<T>::value) {
+ // Optimize for speed by using Eigen::type2index and avoiding
+ // .broadcast() when we know it's a no-op.
+ //
+ // Here, we need to handle 6 cases depending on how many "1"s
+ // appear in in0's and in1's shapes (4 numbers in total). The two
+ // shapes cannot have more than two 1s between them, because such
+ // cases are simplified to the NDIMS==1 case.
+ //
+ // Because this optimization increases the binary size for each
+ // Functor (+, -, *, /, <, <=, etc.), type, and ndim combination,
+ // we only apply it for selected ops/types/ndims.
+ //
+ // Because NDIMS, Functor::use_bcast_optimization and
+ // use_bcast_optimization<T> are compile-time constants, gcc
+ // does a decent job of avoiding generating code when the
+ // conditions are not met.
+ const int a = in0.dimension(0); // in0 is shape [a, b]
+ const int b = in0.dimension(1);
+ const int c = in1.dimension(0); // in1 is shape [c, d]
+ const int d = in1.dimension(1);
+ if ((a == 1) && (d == 1)) {
+ auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c));
+ auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if ((b == 1) && (c == 1)) {
+ auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d));
+ auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (a == 1) {
+ auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c));
+ auto rhs = in1;
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (b == 1) {
+ auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d));
+ auto rhs = in1;
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (c == 1) {
+ auto lhs = in0;
+ auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ if (d == 1) {
+ auto lhs = in0;
+ auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b));
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+
+ const bool bcast0_all_one = AllOne<NDIMS>(bcast0);
+ const bool bcast1_all_one = AllOne<NDIMS>(bcast1);
+ if (bcast0_all_one && !bcast1_all_one) {
+ auto lhs = in0; // No need to do broadcast for in0
+ auto rhs = in1.broadcast(bcast1);
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+
+ if (!bcast0_all_one && bcast1_all_one) {
+ auto lhs = in0.broadcast(bcast0);
+ auto rhs = in1; // No need to do broadcast for in1
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ return;
+ }
+ }
+
+ // Fallback path. Always works, but is probably slower.
+ auto lhs = in0.broadcast(bcast0);
+ auto rhs = in1.broadcast(bcast1);
+ Assign(dev, out, lhs.binaryExpr(rhs, func));
+ }
+};
+
+// Partial specialization of UnaryFunctor<Device=CPUDevice, Functor>.
+template <typename Functor>
+struct UnaryFunctor<CPUDevice, Functor> {
+ void operator()(const CPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in) {
+ Assign(d, out, in.unaryExpr(typename Functor::func()));
+ }
+};
+
+template <typename T>
+struct SelectFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<bool>::ConstFlat cond_flat,
+ typename TTypes<T>::ConstFlat then_flat,
+ typename TTypes<T>::ConstFlat else_flat) {
+ Assign(d, out, cond_flat.select(then_flat, else_flat));
+ }
+};
+
+} // end namespace functor
+
+#define REGISTER_SELECT(D, N, F, T) \
+ REGISTER_KERNEL_BUILDER(Name(N).Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ SelectOp<D##Device, T>)
+
+#define REGISTER(OP, D, N, F, T) \
+ REGISTER_KERNEL_BUILDER(Name(N).Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ OP<D##Device, F<T>>);
+
+// Macros to register kernels for multiple types (T0, T1, etc.) on
+// device type "D" (CPU or GPU) for operation "N" (e.g., sqrt) using
+// the functor "F" (e.g., functor::sqrt).
+
+#ifdef __ANDROID__
+// On Android, only register the first type (float)
+#define REGISTER2(OP, D, N, F, T0, T1) REGISTER(OP, D, N, F, T0)
+#define REGISTER3(OP, D, N, F, T0, T1, T2) REGISTER(OP, D, N, F, T0)
+#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) REGISTER(OP, D, N, F, T0)
+#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) REGISTER(OP, D, N, F, T0)
+#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) REGISTER(OP, D, N, F, T0)
+#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \
+ REGISTER(OP, D, N, F, T0)
+#else // !__ANDROID__
+#define REGISTER2(OP, D, N, F, T0, T1) \
+ REGISTER(OP, D, N, F, T0) \
+ REGISTER(OP, D, N, F, T1)
+#define REGISTER3(OP, D, N, F, T0, T1, T2) \
+ REGISTER2(OP, D, N, F, T0, T1) \
+ REGISTER(OP, D, N, F, T2)
+#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) \
+ REGISTER2(OP, D, N, F, T0, T1) \
+ REGISTER2(OP, D, N, F, T2, T3)
+#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) \
+ REGISTER3(OP, D, N, F, T0, T1, T2) \
+ REGISTER2(OP, D, N, F, T3, T4)
+#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) \
+ REGISTER3(OP, D, N, F, T0, T1, T2) \
+ REGISTER3(OP, D, N, F, T3, T4, T5)
+#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \
+ REGISTER4(OP, D, N, F, T0, T1, T2, T3) \
+ REGISTER3(OP, D, N, F, T4, T5, T6)
+#endif // __ANDROID__
+
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
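Putting the pieces together, registering a new coefficient-wise op against these macros takes a one-file translation unit in this directory. Continuing the hypothetical "Round" sketch from cwise_ops.h above (illustrative only; a real op would also need an op definition in ../ops/math_ops.cc and, for GPU, an explicit functor instantiation):

// cwise_op_round.cc (sketch)
#include "tensorflow/core/kernels/cwise_ops_common.h"

namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Round", functor::round, float, double);
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Round", functor::round, float, double);
#endif
}  // namespace tensorflow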
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
new file mode 100644
index 0000000000..b0dc027144
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -0,0 +1,135 @@
+#if !GOOGLE_CUDA
+#error This file must only be included when building with Cuda support
+#endif
+
+#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#define TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+
+#define EIGEN_USE_GPU
+
+#include <complex>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/kernels/cwise_ops.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+#include "tensorflow/core/platform/logging.h"
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+typedef std::complex<float> complex64;
+
+// Partial specialization of UnaryFunctor<Device=GPUDevice, Functor>.
+template <typename Functor>
+struct UnaryFunctor<GPUDevice, Functor> {
+ void operator()(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in) {
+ out.device(d) = in.unaryExpr(typename Functor::func());
+ }
+};
+
+// Partial specialization of BinaryFunctor<Device=GPUDevice, Functor>.
+template <typename Functor, int NDIMS>
+struct BinaryFunctor<GPUDevice, Functor, NDIMS> {
+ void operator()(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in0,
+ typename Functor::tin_type in1) {
+ out.device(d) = in0.binaryExpr(in1, typename Functor::func());
+ }
+
+ void Left(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tscalar_type scalar,
+ typename Functor::tin_type in) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary;
+ out.device(d) = in.unaryExpr(Unary(scalar.data()));
+ }
+
+ void Right(const GPUDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in,
+ typename Functor::tscalar_type scalar) {
+ typedef typename Functor::out_type Tout;
+ typedef typename Functor::in_type Tin;
+ typedef typename Functor::func Binary;
+ typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary;
+ out.device(d) = in.unaryExpr(Unary(scalar.data()));
+ }
+
+ void BCast(const GPUDevice& d,
+ typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+ typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+ typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1) {
+ typedef typename Functor::in_type T;
+ typename Functor::func func;
+ if ((NDIMS == 2) && Functor::use_bcast_optimization &&
+ use_bcast_optimization<T>::value) {
+ const bool bcast0_all_one = AllOne<NDIMS>(bcast0);
+ const bool bcast1_all_one = AllOne<NDIMS>(bcast1);
+ if (bcast0_all_one && !bcast1_all_one) {
+ out.device(d) = in0.binaryExpr(in1.broadcast(bcast1), func);
+ return;
+ }
+ if (!bcast0_all_one && bcast1_all_one) {
+ out.device(d) = in0.broadcast(bcast0).binaryExpr(in1, func);
+ return;
+ }
+ }
+ out.device(d) =
+ in0.broadcast(bcast0).binaryExpr(in1.broadcast(bcast1), func);
+ }
+};
+
+template <typename T>
+struct SelectFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+ typename TTypes<bool>::ConstFlat cond_flat,
+ typename TTypes<T>::ConstFlat then_flat,
+ typename TTypes<T>::ConstFlat else_flat) {
+ out.device(d) = cond_flat.select(then_flat, else_flat);
+ }
+};
+
+// Macros to explicitly instantiate kernels on GPU for multiple types
+// (T0, T1, etc.) for UnaryFunctor (e.g., functor::sqrt).
+#define DEFINE_UNARY1(F, T) template struct UnaryFunctor<GPUDevice, F<T> >
+#define DEFINE_UNARY2(F, T0, T1) \
+ DEFINE_UNARY1(F, T0); \
+ DEFINE_UNARY1(F, T1)
+#define DEFINE_UNARY3(F, T0, T1, T2) \
+ DEFINE_UNARY2(F, T0, T1); \
+ DEFINE_UNARY1(F, T2)
+#define DEFINE_UNARY4(F, T0, T1, T2, T3) \
+ DEFINE_UNARY2(F, T0, T1); \
+ DEFINE_UNARY2(F, T2, T3)
+#define DEFINE_UNARY5(F, T0, T1, T2, T3, T4) \
+ DEFINE_UNARY2(F, T0, T1); \
+ DEFINE_UNARY3(F, T2, T3, T4)
+
+// Macros to explicitly instantiate kernels on GPU for multiple types
+// (T0, T1, etc.) for BinaryFunctor.
+#define DEFINE_BINARY1(F, T) \
+ template struct BinaryFunctor<GPUDevice, F<T>, 1>; \
+ template struct BinaryFunctor<GPUDevice, F<T>, 2>; \
+ template struct BinaryFunctor<GPUDevice, F<T>, 3>
+#define DEFINE_BINARY2(F, T0, T1) \
+ DEFINE_BINARY1(F, T0); \
+ DEFINE_BINARY1(F, T1)
+#define DEFINE_BINARY3(F, T0, T1, T2) \
+ DEFINE_BINARY2(F, T0, T1); \
+ DEFINE_BINARY1(F, T2)
+#define DEFINE_BINARY4(F, T0, T1, T2, T3) \
+ DEFINE_BINARY2(F, T0, T1); \
+ DEFINE_BINARY2(F, T2, T3)
+#define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \
+ DEFINE_BINARY2(F, T0, T1); \
+ DEFINE_BINARY3(F, T2, T3, T4)
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
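The DEFINE_UNARYn / DEFINE_BINARYn macros are intended for per-op *.cu.cc translation units, which explicitly instantiate the GPU functors that the REGISTERn(..., GPU, ...) lines elsewhere in this patch rely on. A sketch of such a file, assuming a cwise_op_gpu_sqrt.cu.cc naming convention (the file itself is not part of this excerpt):

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"

namespace tensorflow {
namespace functor {
DEFINE_UNARY2(sqrt, float, double);
}  // namespace functor
}  // namespace tensorflow
#endif  // GOOGLE_CUDA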
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
new file mode 100644
index 0000000000..56af248117
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -0,0 +1,167 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+// Creates a Graph which applies a unary "func" on a 3D float tensor
+// of "num" elements.
+static Graph* Unary(const string& func, int num) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
+ CHECK_GT(data.NumElements(), 0);
+ data.flat<float>().setRandom();
+ test::graph::Unary(g, func, test::graph::Constant(g, data), 0);
+ return g;
+}
+
+static int kRows = 100000;
+
+static int RowsAndColsArg(int r, int c) { return r * kRows + c; }
+static int RowsFromArg(int arg) { return (arg / kRows); }
+static int ColsFromArg(int arg) { return (arg % kRows); }
+
+#define BM_UNARY(DEVICE, FUNC) \
+ static void BM_##DEVICE##_##FUNC(int iters, int num) { \
+ const int64 tot = static_cast<int64>(iters) * num; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, Unary(#FUNC, num)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_##FUNC)->Range(4 << 10, 1 << 20);
+
+BM_UNARY(cpu, Floor);
+BM_UNARY(gpu, Floor);
+
+// Creates a Graph which applies a binary "func" to a 3D float tensor
+// of "num" elements and a scalar.
+static Graph* BinaryScalar(int num, const string& func) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
+ lhs.flat<float>().setRandom();
+ Tensor rhs(DT_FLOAT, TensorShape({}));
+ rhs.flat<float>().setRandom();
+ test::graph::Binary(g, func, test::graph::Constant(g, lhs),
+ test::graph::Constant(g, rhs));
+ return g;
+}
+
+#define BM_BINARY_SCALAR(DEVICE, FUNC) \
+ static void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \
+ const int64 tot = static_cast<int64>(iters) * num; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \
+ ->Arg(4096) /* must >= 4096 */ \
+ ->Arg(32768) \
+ ->Arg(131072) \
+ ->Arg(1048576);
+
+BM_BINARY_SCALAR(cpu, Less);
+BM_BINARY_SCALAR(gpu, Less);
+BM_BINARY_SCALAR(cpu, Add);
+BM_BINARY_SCALAR(gpu, Add);
+#undef BM_BINARY_SCALAR
+
+static Graph* BiasAdd(int rows, int cols) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor lhs(DT_FLOAT, TensorShape({rows, cols}));
+ lhs.flat<float>().setRandom();
+ TensorShape rhs_shape;
+ rhs_shape = TensorShape({cols});
+ Tensor rhs(DT_FLOAT, rhs_shape);
+ rhs.flat<float>().setRandom();
+ test::graph::Binary(g, "BiasAdd", test::graph::Constant(g, lhs),
+ test::graph::Constant(g, rhs));
+ return g;
+}
+
+#define BM_BIAS_ADD(DEVICE, R, C) \
+ static void BM_##DEVICE##_BiasAdd_R##R##_C##C(int iters, int arg) { \
+ const int rows = RowsFromArg(arg); \
+ const int cols = ColsFromArg(arg); \
+ const int64 tot = static_cast<int64>(iters) * rows * cols; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BiasAdd(rows, cols)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_BiasAdd_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
+
+#define BM_BIAS_ADD_ALL(DEVICE) \
+ BM_BIAS_ADD(DEVICE, 512, 2048); \
+ BM_BIAS_ADD(DEVICE, 512, 4096); \
+ BM_BIAS_ADD(DEVICE, 2048, 512); \
+ BM_BIAS_ADD(DEVICE, 4096, 512);
+
+BM_BIAS_ADD_ALL(cpu);
+BM_BIAS_ADD_ALL(gpu);
+#undef BM_BIAS_ADD_ALL
+#undef BM_BIAS_ADD
+
+static Graph* BcastAdd(int rows, int cols, int dim) {
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor lhs(DT_FLOAT, TensorShape({rows, cols}));
+ lhs.flat<float>().setRandom();
+ TensorShape rhs_shape;
+ if (dim == 0) {
+ rhs_shape = TensorShape({rows, 1});
+ } else {
+ rhs_shape = TensorShape({cols});
+ }
+ Tensor rhs(DT_FLOAT, rhs_shape);
+ rhs.flat<float>().setRandom();
+ test::graph::Binary(g, "Add", test::graph::Constant(g, lhs),
+ test::graph::Constant(g, rhs));
+ return g;
+}
+
+#define BM_BCAST_ADD_ROW(DEVICE, R, C) \
+ static void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \
+ const int rows = RowsFromArg(arg); \
+ const int cols = ColsFromArg(arg); \
+ const int64 tot = static_cast<int64>(iters) * rows * cols; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
+
+#define BM_BCAST_ADD_ROW_ALL(DEVICE) \
+ BM_BCAST_ADD_ROW(DEVICE, 512, 2048); \
+ BM_BCAST_ADD_ROW(DEVICE, 512, 4096); \
+ BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
+ BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
+BM_BCAST_ADD_ROW_ALL(cpu);
+BM_BCAST_ADD_ROW_ALL(gpu);
+#undef BM_BCAST_ADD_ROW_ALL
+#undef BM_BCAST_ADD_ROW
+
+#define BM_BCAST_ADD_COL(DEVICE, R, C) \
+ static void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \
+ const int rows = RowsFromArg(arg); \
+ const int cols = ColsFromArg(arg); \
+ const int64 tot = static_cast<int64>(iters) * rows * cols; \
+ testing::ItemsProcessed(tot); \
+ testing::BytesProcessed(tot * sizeof(float)); \
+ test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
+
+#define BM_BCAST_ADD_COL_ALL(DEVICE) \
+ BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
+ BM_BCAST_ADD_COL(DEVICE, 512, 4096); \
+ BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
+ BM_BCAST_ADD_COL(DEVICE, 4096, 512);
+BM_BCAST_ADD_COL_ALL(cpu);
+BM_BCAST_ADD_COL_ALL(gpu);
+#undef BM_BCAST_ADD_COL_ALL
+#undef BM_BCAST_ADD_COL
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
new file mode 100644
index 0000000000..0919bab96f
--- /dev/null
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -0,0 +1,222 @@
+// See docs in ../ops/parsing_ops.cc.
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class DecodeCSVOp : public OpKernel {
+ public:
+ explicit DecodeCSVOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ string delim;
+
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("OUT_TYPE", &out_type_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("field_delim", &delim));
+
+ OP_REQUIRES(ctx, delim.size() == 1,
+ errors::InvalidArgument("field_delim should be only 1 char"));
+
+ delim_ = delim[0];
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* records;
+ OpInputList record_defaults;
+
+ OP_REQUIRES_OK(ctx, ctx->input("records", &records));
+ OP_REQUIRES_OK(ctx, ctx->input_list("record_defaults", &record_defaults));
+
+ for (int i = 0; i < record_defaults.size(); ++i) {
+ OP_REQUIRES(ctx, record_defaults[i].NumElements() < 2,
+ errors::InvalidArgument(
+ "There should only be 1 default per field but field ", i,
+ " has ", record_defaults[i].NumElements()));
+ }
+
+ auto records_t = records->flat<string>();
+ int records_size = records_t.size();
+
+ OpOutputList output;
+ OP_REQUIRES_OK(ctx, ctx->output_list("output", &output));
+
+ for (size_t i = 0; i < out_type_.size(); ++i) {
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, output.allocate(i, records->shape(), &out));
+ }
+
+ for (int i = 0; i < records_size; ++i) {
+ const StringPiece record(records_t(i));
+ std::vector<string> fields;
+ ExtractFields(ctx, record, &fields);
+ OP_REQUIRES(ctx, fields.size() == out_type_.size(),
+ errors::InvalidArgument("Expect ", out_type_.size(),
+ " fields but have ", fields.size(),
+ " in record ", i));
+
+ // Check each field in the record
+ for (size_t f = 0; f < out_type_.size(); ++f) {
+ const DataType& dtype = out_type_[f];
+ switch (dtype) {
+ case DT_INT32: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+
+ output[f]->flat<int32>()(i) = record_defaults[f].flat<int32>()(0);
+ } else {
+ int32 value;
+ OP_REQUIRES(ctx, strings::safe_strto32(fields[f].c_str(), &value),
+ errors::InvalidArgument("Field ", f, " in record ", i,
+ " is not a valid int32: ",
+ fields[f]));
+ output[f]->flat<int32>()(i) = value;
+ }
+ break;
+ }
+ case DT_INT64: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+
+ output[f]->flat<int64>()(i) = record_defaults[f].flat<int64>()(0);
+ } else {
+ int64 value;
+ OP_REQUIRES(ctx, strings::safe_strto64(fields[f].c_str(), &value),
+ errors::InvalidArgument("Field ", f, " in record ", i,
+ " is not a valid int64: ",
+ fields[f]));
+ output[f]->flat<int64>()(i) = value;
+ }
+ break;
+ }
+ case DT_FLOAT: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+ output[f]->flat<float>()(i) = record_defaults[f].flat<float>()(0);
+ } else {
+ float value;
+ OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
+ errors::InvalidArgument("Field ", f, " in record ", i,
+ " is not a valid float: ",
+ fields[f]));
+ output[f]->flat<float>()(i) = value;
+ }
+ break;
+ }
+ case DT_STRING: {
+ // If this field is empty, check whether a default is given:
+ // if so, use the default value; otherwise report an error.
+ if (fields[f].empty()) {
+ OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1,
+ errors::InvalidArgument(
+ "Field ", f,
+ " is required but missing in record ", i, "!"));
+ output[f]->flat<string>()(i) =
+ record_defaults[f].flat<string>()(0);
+ } else {
+ output[f]->flat<string>()(i) = fields[f];
+ }
+ break;
+ }
+ default:
+ OP_REQUIRES(ctx, false,
+ errors::InvalidArgument("csv: data type ", dtype,
+ " not supported in field ", f));
+ }
+ }
+ }
+ }
+
+ private:
+ std::vector<DataType> out_type_;
+ char delim_;
+
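+ // Illustrative example of the splitting rules implemented below: with
+ // delim_ == ',', the record
+ //   1,"a, ""b""",,3.5
+ // is split into the fields {"1", "a, \"b\"", "", "3.5"}; the empty third
+ // field is then filled from record_defaults if a default was provided.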
+ void ExtractFields(OpKernelContext* ctx, StringPiece input,
+ std::vector<string>* result) {
+ int current_idx = 0;
+ if (!input.empty()) {
+ while (static_cast<size_t>(current_idx) < input.size()) {
+ if (input[current_idx] == '\n' || input[current_idx] == '\r') {
+ current_idx++;
+ continue;
+ }
+
+ bool quoted = false;
+ if (input[current_idx] == '"') {
+ quoted = true;
+ current_idx++;
+ }
+
+ // This is the body of the field.
+ string field;
+ if (!quoted) {
+ while (static_cast<size_t>(current_idx) < input.size() &&
+ input[current_idx] != delim_) {
+ OP_REQUIRES(ctx, input[current_idx] != '"' &&
+ input[current_idx] != '\n' &&
+ input[current_idx] != '\r',
+ errors::InvalidArgument(
+ "Unquoted fields cannot have quotes/CRLFs inside"));
+ field += input[current_idx];
+ current_idx++;
+ }
+
+ // Go to next field or the end
+ current_idx++;
+ } else {
+ // A quoted field must end with '"' followed by the delimiter or the end of the record.
+ while (
+ (static_cast<size_t>(current_idx) < input.size() - 1) &&
+ (input[current_idx] != '"' || input[current_idx + 1] != delim_)) {
+ if (input[current_idx] != '"') {
+ field += input[current_idx];
+ current_idx++;
+ } else {
+ OP_REQUIRES(
+ ctx, input[current_idx + 1] == '"',
+ errors::InvalidArgument("Quote inside a string has to be "
+ "escaped by another quote"));
+ field += '"';
+ current_idx += 2;
+ }
+ }
+
+ OP_REQUIRES(
+ ctx,
+ input[current_idx] == '"' &&
+ (static_cast<size_t>(current_idx) == input.size() - 1 ||
+ input[current_idx + 1] == delim_),
+ errors::InvalidArgument("Quoted field has to end with quote "
+ "followed by delim or end"));
+
+ current_idx += 2;
+ }
+
+ result->push_back(field);
+ }
+
+ // Check if the last field is missing
+ if (input[input.size() - 1] == delim_) result->push_back(string());
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeCSV").Device(DEVICE_CPU), DecodeCSVOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_jpeg_op.cc b/tensorflow/core/kernels/decode_jpeg_op.cc
new file mode 100644
index 0000000000..e41d3f3e11
--- /dev/null
+++ b/tensorflow/core/kernels/decode_jpeg_op.cc
@@ -0,0 +1,72 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
+
+namespace tensorflow {
+
+// Decode the contents of a JPEG file
+class DecodeJpegOp : public OpKernel {
+ public:
+ explicit DecodeJpegOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("channels", &flags_.components));
+ OP_REQUIRES(context, flags_.components == 0 || flags_.components == 1 ||
+ flags_.components == 3,
+ errors::InvalidArgument("channels must be 0, 1, or 3, got ",
+ flags_.components));
+ OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
+ OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
+ flags_.ratio == 4 || flags_.ratio == 8,
+ errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
+ flags_.ratio));
+ OP_REQUIRES_OK(
+ context, context->GetAttr("fancy_upscaling", &flags_.fancy_upscaling));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("try_recover_truncated",
+ &flags_.try_recover_truncated_jpeg));
+ OP_REQUIRES_OK(context, context->GetAttr("acceptable_fraction",
+ &flags_.min_acceptable_fraction));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& contents = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+ errors::InvalidArgument("contents must be scalar, got shape ",
+ contents.shape().ShortDebugString()));
+ const StringPiece input = contents.scalar<string>()();
+ OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
+ errors::InvalidArgument("JPEG contents are too large for int: ",
+ input.size()));
+
+ // Decode image, allocating tensor once the image size is known
+ Tensor* output = NULL;
+ OP_REQUIRES(
+ context,
+ jpeg::Uncompress(
+ input.data(), input.size(), flags_, NULL,
+ [=, &output](int width, int height, int channels) -> uint8* {
+ Status status(context->allocate_output(
+ 0, TensorShape({height, width, channels}), &output));
+ if (!status.ok()) {
+ VLOG(1) << status;
+ context->SetStatus(status);
+ return nullptr;
+ }
+ return output->flat<uint8>().data();
+ }),
+ errors::InvalidArgument("Invalid JPEG data, size ", input.size()));
+ }
+
+ private:
+ jpeg::UncompressFlags flags_;
+};
+REGISTER_KERNEL_BUILDER(Name("DecodeJpeg").Device(DEVICE_CPU), DecodeJpegOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_png_op.cc b/tensorflow/core/kernels/decode_png_op.cc
new file mode 100644
index 0000000000..e8071526f9
--- /dev/null
+++ b/tensorflow/core/kernels/decode_png_op.cc
@@ -0,0 +1,69 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/png/png_io.h"
+
+namespace tensorflow {
+
+// Decode the contents of a PNG file
+class DecodePngOp : public OpKernel {
+ public:
+ explicit DecodePngOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
+ OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
+ channels_ == 4,
+ errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+ channels_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& contents = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+ errors::InvalidArgument("contents must be scalar, got shape ",
+ contents.shape().ShortDebugString()));
+
+ // Start decoding image to get shape details
+ const StringPiece data = contents.scalar<string>()();
+ png::DecodeContext decode;
+ OP_REQUIRES(
+ context, png::CommonInitDecode(data, channels_, 8, &decode),
+ errors::InvalidArgument("Invalid PNG header, data size ", data.size()));
+
+ // Verify that width and height don't overflow int
+ const int width = decode.width;
+ const int height = decode.height;
+ if (width != static_cast<int64>(decode.width) ||
+ height != static_cast<int64>(decode.height)) {
+ png::CommonFreeDecode(&decode);
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument("PNG size too large for int: ",
+ decode.width, " by ", decode.height));
+ }
+
+ // Allocate tensor
+ Tensor* output = nullptr;
+ const auto status = context->allocate_output(
+ 0, TensorShape({height, width, decode.channels}), &output);
+ if (!status.ok()) png::CommonFreeDecode(&decode);
+ OP_REQUIRES_OK(context, status);
+
+ // Finish decoding image
+ OP_REQUIRES(
+ context, png::CommonFinishDecode(output->flat<uint8>().data(),
+ decode.channels * width, &decode),
+ errors::InvalidArgument("Invalid PNG data, size ", data.size()));
+ }
+
+ private:
+ int channels_;
+};
+REGISTER_KERNEL_BUILDER(Name("DecodePng").Device(DEVICE_CPU), DecodePngOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
new file mode 100644
index 0000000000..ef24c333a4
--- /dev/null
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -0,0 +1,90 @@
+// See docs in ../ops/parse_ops.cc.
+
+#include <algorithm>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
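+// Illustrative example of the reshaping performed by this op: with out_type
+// DT_INT32 (sizeof(T) == 4), an input of shape [2] whose strings are each
+// 8 bytes long decodes to an output of shape [2, 2].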
+template <typename T>
+class DecodeRawOp : public OpKernel {
+ public:
+ explicit DecodeRawOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("little_endian", &little_endian_));
+ OP_REQUIRES_OK(context, context->GetAttr("out_type", &out_type_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const auto& input = context->input(0);
+ int str_size = -1;
+ auto flat_in = input.flat<string>();
+ for (int i = 0; i < flat_in.size(); ++i) {
+ const string& in_str = flat_in(i);
+ if (str_size == -1) {
+ str_size = in_str.size();
+ } else {
+ OP_REQUIRES(context, str_size == in_str.size(),
+ errors::InvalidArgument(
+ "DecodeRaw requires input strings to all be the same "
+ "size, but element ",
+ i, " has size ", str_size, " != ", in_str.size()));
+ }
+ }
+ TensorShape out_shape = input.shape();
+ if (str_size == -1) { // Empty input
+ out_shape.AddDim(1);
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("output", out_shape,
+ &output_tensor));
+ return;
+ }
+ OP_REQUIRES(
+ context, str_size % sizeof(T) == 0,
+ errors::InvalidArgument("Input to DecodeRaw has length ", str_size,
+ " that is not a multiple of ", sizeof(T),
+ ", the size of ", DataTypeString(out_type_)));
+ const int added_dim = str_size / sizeof(T);
+ out_shape.AddDim(added_dim);
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_output("output", out_shape, &output_tensor));
+ auto out = output_tensor->flat_inner_dims<T>();
+ DCHECK_EQ(flat_in.size(), out.dimensions()[0]);
+ OP_REQUIRES(
+ context,
+ little_endian_ == ::tensorflow::port::kLittleEndian || sizeof(T) == 1,
+ errors::Unimplemented("Unimplemented support for little_endian=",
+ little_endian_ ? "true" : "false"));
+ // Endianness matches, so just copy each string byte-for-byte.
+ T* out_data = out.data();
+ for (int i = 0; i < flat_in.size(); ++i) {
+ const T* in_data = reinterpret_cast<const T*>(flat_in(i).data());
+ memcpy(out_data, in_data, str_size);
+ out_data += added_dim;
+ }
+ }
+
+ private:
+ bool little_endian_;
+ DataType out_type_;
+};
+
+#define REGISTER(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("DecodeRaw").Device(DEVICE_CPU).TypeConstraint<type>("out_type"), \
+ DecodeRawOp<type>)
+
+REGISTER(float);
+REGISTER(double);
+REGISTER(int32);
+REGISTER(uint8);
+REGISTER(int16);
+REGISTER(int8);
+REGISTER(int64);
+
+#undef REGISTER
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
new file mode 100644
index 0000000000..f56c37b4ef
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -0,0 +1,136 @@
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/assign_op.h"
+#include "tensorflow/core/kernels/dense_update_ops.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+template <typename Device, typename T>
+class AssignOpT : public AssignOp {
+ public:
+ using AssignOp::AssignOp;
+
+ void Copy(OpKernelContext* context, Tensor* lhs, const Tensor& rhs) override {
+ functor::DenseUpdate<Device, T, ASSIGN> copy;
+ copy(context->eigen_device<Device>(), lhs->flat<T>(), rhs.flat<T>());
+ }
+};
+
+// TODO(jeff): Get rid of use_exclusive_lock_ option
+template <typename Device, typename T, DenseUpdateType OP>
+class DenseUpdateOp : public OpKernel {
+ public:
+ explicit DenseUpdateOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context,
+ context->GetAttr("use_locking", &use_exclusive_lock_));
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(context, context->MatchSignature({MakeRefType(dt), dt},
+ {MakeRefType(dt)}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // We always return the input ref.
+ context->forward_ref_input_to_ref_output(0, 0);
+
+ if (use_exclusive_lock_) {
+ mutex_lock l(*context->input_ref_mutex(0));
+ DoUpdate(context);
+ } else {
+ DoUpdate(context);
+ }
+ }
+
+ private:
+ void DoUpdate(OpKernelContext* context) {
+ Tensor Tparams = context->mutable_input(0, use_exclusive_lock_);
+ const Tensor& Tupdate = context->input(1);
+ OP_REQUIRES(context, Tparams.IsInitialized(),
+ errors::FailedPrecondition("Attempting to use uninitialized "
+ "parameters: ",
+ def().input(0)));
+ OP_REQUIRES(
+ context, Tparams.IsSameSize(Tupdate),
+ errors::InvalidArgument("Parameters and update must be the same size"));
+
+ functor::DenseUpdate<Device, T, OP> update_functor;
+ update_functor(context->eigen_device<Device>(), Tparams.flat<T>(),
+ Tupdate.flat<T>());
+ }
+
+ bool use_exclusive_lock_;
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Assign").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ AssignOpT<CPUDevice, type>);
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Only register 'Assign' on GPU for the subset of types also supported by
+// 'Variable' (see variable_ops.cc.)
+#define REGISTER_GPU_KERNELS(type) \
+ namespace functor { \
+ template <> \
+ void DenseUpdate<GPUDevice, type, ASSIGN>::operator()( \
+ const GPUDevice& d, typename TTypes<type>::Flat lhs, \
+ typename TTypes<type>::ConstFlat rhs); \
+ extern template struct DenseUpdate<GPUDevice, type, ASSIGN>; \
+ } \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Assign").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ AssignOpT<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignAdd").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<CPUDevice, type, DenseUpdateType::ADD>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignSub").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<CPUDevice, type, DenseUpdateType::SUB>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC_FOR_OP(T, OP) \
+ template <> \
+ void DenseUpdate<GPUDevice, T, OP>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat params, \
+ typename TTypes<T>::ConstFlat update); \
+ extern template struct DenseUpdate<GPUDevice, T, OP>
+#define DECLARE_GPU_SPEC(T) \
+ DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::ADD); \
+ DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::SUB)
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+#undef DECLARE_GPU_SPEC
+#undef DECLARE_GPU_SPEC_FOR_OP
+} // namespace functor
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<GPUDevice, type, DenseUpdateType::ADD>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignSub").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ DenseUpdateOp<GPUDevice, type, DenseUpdateType::SUB>);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif // end GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.h b/tensorflow/core/kernels/dense_update_ops.h
new file mode 100644
index 0000000000..d32c9a4af2
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_ops.h
@@ -0,0 +1,43 @@
+#ifndef TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
+#define TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+enum DenseUpdateType { ADD, SUB, ASSIGN };
+
+namespace functor {
+
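+// A minimal CPU usage sketch (illustrative only; the device and the two
+// tensors are assumed to come from the calling kernel, as in
+// dense_update_ops.cc):
+//
+//   const Eigen::ThreadPoolDevice& d =
+//       context->eigen_device<Eigen::ThreadPoolDevice>();
+//   functor::DenseUpdate<Eigen::ThreadPoolDevice, float, ADD> add;
+//   add(d, params.flat<float>(), update.flat<float>());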
+template <typename Device, typename T, DenseUpdateType OP>
+struct DenseUpdate;
+
+template <typename Device, typename T>
+struct DenseUpdate<Device, T, ADD> {
+ void operator()(const Device& d, typename TTypes<T>::Flat params,
+ typename TTypes<T>::ConstFlat update) {
+ params.device(d) += update;
+ }
+};
+
+template <typename Device, typename T>
+struct DenseUpdate<Device, T, SUB> {
+ void operator()(const Device& d, typename TTypes<T>::Flat params,
+ typename TTypes<T>::ConstFlat update) {
+ params.device(d) -= update;
+ }
+};
+
+template <typename Device, typename T>
+struct DenseUpdate<Device, T, ASSIGN> {
+ void operator()(const Device& d, typename TTypes<T>::Flat params,
+ typename TTypes<T>::ConstFlat update) {
+ params.device(d) = update;
+ }
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
diff --git a/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc b/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc
new file mode 100644
index 0000000000..8e80901c71
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc
@@ -0,0 +1,22 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/dense_update_ops.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::DenseUpdate<GPUDevice, T, ADD>; \
+ template struct functor::DenseUpdate<GPUDevice, T, SUB>; \
+ template struct functor::DenseUpdate<GPUDevice, T, ASSIGN>;
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+#undef DEFINE_GPU_KERNELS
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/determinant_op.cc b/tensorflow/core/kernels/determinant_op.cc
new file mode 100644
index 0000000000..d34aab7a44
--- /dev/null
+++ b/tensorflow/core/kernels/determinant_op.cc
@@ -0,0 +1,66 @@
+// See docs in ../ops/linalg_ops.cc.
+#include <cmath>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/Eigen/LU"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperationT>
+class DeterminantOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
+ public:
+ explicit DeterminantOp(OpKernelConstruction* context)
+ : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {}
+ ~DeterminantOp() override {}
+
+ TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) override {
+ return TensorShape({});
+ }
+
+ int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
+ const int64 rows = input_matrix_shape.dim_size(0);
+ if (rows > (1LL << 20)) {
+ // A big number to cap the cost in case of overflow.
+ return kint32max;
+ } else {
+ return rows * rows * rows;
+ }
+ }
+
+ using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap;
+ using
+ typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap;
+
+ void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
+ MatrixMap* output) override {
+ OP_REQUIRES(context, input.rows() == input.cols(),
+ errors::InvalidArgument("Input matrix must be square."));
+ Scalar determinant;
+ if (input.rows() == 0) {
+ // The determinant of an empty matrix is defined to be 1 (see Wikipedia).
+ determinant = 1;
+ } else {
+ determinant = input.determinant();
+ }
+ OP_REQUIRES(context, std::isfinite(determinant),
+ errors::Internal("The determinant is not finite."));
+ (*output)(0, 0) = determinant;
+ }
+};
+
+REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<float, false>), float);
+REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<double, false>), double);
+REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<float, true>),
+ float);
+REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<double, true>),
+ double);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
new file mode 100644
index 0000000000..83e39d33a9
--- /dev/null
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -0,0 +1,93 @@
+// See docs in ../ops/array_ops.cc
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace {
+template <typename T, size_t NumDims, size_t DoubleNumDims>
+class DiagonalGenerator {
+ public:
+ explicit DiagonalGenerator(const Tensor& diagonal) : diagonal_(diagonal) {
+ static_assert(DoubleNumDims == 2 * NumDims,
+ "The second size must be the double of the first size.");
+ CHECK_EQ(diagonal.dims(), NumDims);
+ }
+ T operator()(
+ const Eigen::array<Eigen::DenseIndex, DoubleNumDims>& coordinates) const {
+ Eigen::array<Eigen::DenseIndex, NumDims> index;
+ for (int i = 0; i < NumDims; ++i) {
+ if (coordinates[i] != coordinates[NumDims + i]) {
+ return T(0);
+ }
+ index[i] = coordinates[i];
+ }
+ return diagonal_.tensor<T, NumDims>()(index);
+ }
+
+ private:
+ Tensor diagonal_;
+};
+} // namespace
+
+// Generate the diagonal tensor with the diagonal set to the input tensor.
+// It only allows up to rank 3 input tensor, so the output tensor is up to
+// rank 6.
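+// For example, a rank-1 input [1, 2, 3] produces the rank-2 output
+//   [[1, 0, 0],
+//    [0, 2, 0],
+//    [0, 0, 3]].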
+template <typename T>
+class DiagOp : public OpKernel {
+ public:
+ explicit DiagOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& diagonal = context->input(0);
+ const int num_dims = diagonal.dims();
+ OP_REQUIRES(context, 1 <= num_dims,
+ errors::InvalidArgument(
+ "The rank of the diagonal should be between 1 and 3."));
+ OP_REQUIRES(context, 3 >= num_dims,
+ errors::InvalidArgument(
+ "The rank of the diagonal should be between 1 and 3."));
+ TensorShape out_shape;
+ for (int i = 0; i < num_dims; ++i) {
+ out_shape.AddDim(diagonal.dim_size(i));
+ }
+ for (int i = 0; i < num_dims; ++i) {
+ out_shape.AddDim(diagonal.dim_size(i));
+ }
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, out_shape, &output_tensor));
+ switch (num_dims) {
+ case 1:
+ output_tensor->tensor<T, 2>() = output_tensor->tensor<T, 2>().generate(
+ DiagonalGenerator<T, 1, 2>(diagonal));
+ break;
+ case 2:
+ output_tensor->tensor<T, 4>() = output_tensor->tensor<T, 4>().generate(
+ DiagonalGenerator<T, 2, 4>(diagonal));
+ break;
+ case 3:
+ output_tensor->tensor<T, 6>() = output_tensor->tensor<T, 6>().generate(
+ DiagonalGenerator<T, 3, 6>(diagonal));
+ break;
+ default:
+ context->SetStatus(errors::Unimplemented(
+ "Diagonal of rank ", num_dims, " tensor is not supported yet."));
+ return;
+ }
+ }
+};
+
+#define REGISTER_DIAGOP(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Diag").Device(DEVICE_CPU).TypeConstraint<T>("T"), DiagOp<T>)
+
+REGISTER_DIAGOP(double);
+REGISTER_DIAGOP(float);
+REGISTER_DIAGOP(int32);
+REGISTER_DIAGOP(int64);
+
+#undef REGISTER_DIAGOP
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
new file mode 100644
index 0000000000..f1b44861b5
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -0,0 +1,154 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+// Shared code that is not dependent on the type of T. We do this to reduce
+// code size by not duplicating all this for all T (float, double, int32, etc.)
+class DynamicPartitionOp_Shared : public OpKernel {
+ public:
+ explicit DynamicPartitionOp_Shared(OpKernelConstruction* c) : OpKernel(c) {
+ OP_REQUIRES_OK(c, c->GetAttr("num_partitions", &num_partitions_));
+ // QUESTION: It'd be nice to support DT_INT16, DT_UINT8, etc.
+ // to input[1]. Should we have the framework do some sort of
+ // integer promotion automatically, or should that be something
+ // that users have to do explicitly with a conversion operator
+ // in the graph?
+ }
+
+ void ValidateAndAllocateOutputs(OpKernelContext* c, const Tensor** data,
+ const Tensor** partitions,
+ OpOutputList* Tout) {
+ OP_REQUIRES_OK(c, c->input("data", data));
+ OP_REQUIRES_OK(c, c->input("partitions", partitions));
+ OP_REQUIRES(c, TensorShapeUtils::StartsWith((*data)->shape(),
+ (*partitions)->shape()),
+ errors::InvalidArgument(
+ "data.shape must start with partitions.shape, ",
+ "got data.shape = ", (*data)->shape().ShortDebugString(),
+ ", partitions.shape = ",
+ (*partitions)->shape().ShortDebugString()));
+
+ // Count how many occurrences of each partition id we have in partitions
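+ // (For example, partitions = [0, 0, 1, 2] with num_partitions_ = 3 gives
+ // partition_count = {2, 1, 1}.)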
+ gtl::InlinedVector<int, 32> partition_count(num_partitions_);
+ auto e_partitions = (*partitions)->flat<int32>();
+ const int64 N = e_partitions.dimension(0);
+ for (int64 i = 0; i < N; i++) {
+ const int32 p = e_partitions(i);
+ OP_REQUIRES(c, p >= 0 && p < num_partitions_,
+ errors::InvalidArgument(
+ "partitions", SliceString((*partitions)->shape(), i),
+ " = ", p, " is not in [0, ", num_partitions_, ")"));
+ partition_count[p]++;
+ }
+
+ // Allocate output tensors of the right size
+ OP_REQUIRES_OK(c, c->output_list("outputs", Tout));
+ for (int p = 0; p < num_partitions_; p++) {
+ TensorShape shape;
+ shape.AddDim(partition_count[p]);
+ for (int i = (*partitions)->dims(); i < (*data)->dims(); i++) {
+ shape.AddDim((*data)->dim_size(i));
+ }
+ Tensor* out;
+ OP_REQUIRES_OK(c, Tout->allocate(p, shape, &out));
+ }
+ }
+
+ protected:
+ int num_partitions_;
+
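+ // Renders a flat index into "partitions" as a bracketed coordinate for the
+ // error message above. For example, for shape [2, 3, 4] the strides are
+ // {12, 4, 1}, so SliceString(shape, 17) returns "[1,1,1]".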
+ static string SliceString(const TensorShape& shape, const int64 flat) {
+ // Special case rank 0 and 1
+ const int dims = shape.dims();
+ if (dims == 0) return "";
+ if (dims == 1) return strings::StrCat("[", flat, "]");
+
+ // Compute strides
+ gtl::InlinedVector<int64, 32> strides(dims);
+ strides.back() = 1;
+ for (int i = dims - 2; i >= 0; i--) {
+ strides[i] = strides[i + 1] * shape.dim_size(i + 1);
+ }
+
+ // Unflatten index
+ int64 left = flat;
+ string result;
+ for (int i = 0; i < dims; i++) {
+ strings::StrAppend(&result, i ? "," : "[", left / strides[i]);
+ left %= strides[i];
+ }
+ strings::StrAppend(&result, "]");
+ return result;
+ }
+};
+
+template <class T>
+class DynamicPartitionOp : public DynamicPartitionOp_Shared {
+ public:
+ explicit DynamicPartitionOp(OpKernelConstruction* c)
+ : DynamicPartitionOp_Shared(c) {}
+ void Compute(OpKernelContext* c) override {
+ const Tensor* data;
+ const Tensor* partitions;
+ OpOutputList outputs;
+ ValidateAndAllocateOutputs(c, &data, &partitions, &outputs);
+ if (!c->status().ok()) return;
+ if (num_partitions_ == 0 || data->NumElements() == 0) return;
+
+ auto e_partitions = partitions->flat<int32>();
+ const int64 N = e_partitions.dimension(0);
+ gtl::InlinedVector<int, 32> output_index(num_partitions_);
+
+ if (partitions->dims() == data->dims()) {
+ // Walk through data and copy the data to the appropriate output tensor
+ const auto data_flat = data->flat<T>();
+ std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
+ Eigen::Aligned> > out_vec;
+ for (int p = 0; p < num_partitions_; p++) {
+ out_vec.push_back(outputs[p]->vec<T>());
+ }
+ for (int64 i = 0; i < N; i++) {
+ const int32 p = e_partitions(i);
+ out_vec[p](output_index[p]) = data_flat(i);
+ output_index[p]++;
+ }
+ } else {
+ // If data has extra dimensions, use Eigen slices
+ std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
+ Eigen::Aligned> > out_flat;
+ for (int p = 0; p < num_partitions_; p++) {
+ out_flat.push_back(outputs[p]->flat_outer_dims<T>());
+ }
+
+ // Walk through data and copy the data to the appropriate output tensor
+ const int64 slice_size = data->NumElements() / N;
+ const auto data_flat = data->shaped<T, 2>({N, slice_size});
+ Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, slice_size);
+ for (int64 i = 0; i < N; i++) {
+ const int32 p = e_partitions(i);
+ // outputs[p][output_index[p]++] = data[i]
+ Eigen::DSizes<Eigen::DenseIndex, 2> out_indices(output_index[p], 0);
+ Eigen::DSizes<Eigen::DenseIndex, 2> data_indices(i, 0);
+ out_flat[p].slice(out_indices, sizes) =
+ data_flat.slice(data_indices, sizes);
+ output_index[p]++;
+ }
+ }
+ }
+};
+
+#define REGISTER_DYNAMIC_PARTITION(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("DynamicPartition").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ DynamicPartitionOp<T>)
+
+TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_PARTITION);
+#undef REGISTER_DYNAMIC_PARTITION
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
new file mode 100644
index 0000000000..b0e5e7deb0
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -0,0 +1,145 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+class DynamicPartitionOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "DynamicPartition")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_INT32))
+ .Attr("num_partitions", 4)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(DynamicPartitionOpTest, Simple_OneD) {
+ MakeOp();
+
+ // Similar to how we would use this to split embedding ids to be looked up
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({6}), {0, 13, 2, 39, 4, 17});
+ AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 3, 2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output sizes
+ { // Output 0
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2}));
+ test::FillValues<float>(&expected, {0, 13});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+ }
+ { // Output 1
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1}));
+ test::FillValues<float>(&expected, {17});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(1));
+ }
+ { // Output 2
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2}));
+ test::FillValues<float>(&expected, {2, 4});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(2));
+ }
+ { // Output 3
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1}));
+ test::FillValues<float>(&expected, {39});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(3));
+ }
+}
+
+TEST_F(DynamicPartitionOpTest, Simple_TwoD) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<float>(
+ TensorShape({6, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
+ AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 3, 2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output sizes
+ { // Output 0
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+ test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+ }
+ { // Output 1
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3}));
+ test::FillValues<float>(&expected, {15, 16, 17});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(1));
+ }
+ { // Output 2
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+ test::FillValues<float>(&expected, {6, 7, 8, 12, 13, 14});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(2));
+ }
+ { // Output 3
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3}));
+ test::FillValues<float>(&expected, {9, 10, 11});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(3));
+ }
+}
+
+TEST_F(DynamicPartitionOpTest, SomeOutputsEmpty) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({6}), {0, 13, 2, 39, 4, 17});
+ AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 2, 0, 2});
+ ASSERT_OK(RunOpKernel());
+
+ TensorShape empty_one_dim;
+ empty_one_dim.AddDim(0);
+ Tensor expected_empty(allocator(), DT_FLOAT, empty_one_dim);
+
+ // Check the output sizes
+ { // Output 0
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
+ test::FillValues<float>(&expected, {0, 13, 4});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+ }
+ { // Output 1
+ test::ExpectTensorEqual<float>(expected_empty, *GetOutput(1));
+ }
+ { // Output 2
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
+ test::FillValues<float>(&expected, {2, 39, 17});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(2));
+ }
+ { // Output 3
+ test::ExpectTensorEqual<float>(expected_empty, *GetOutput(3));
+ }
+}
+
+TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({5}), {0, 2, 99, 2, 2});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString()).contains("partitions[2] = 99 is not in [0, 4)"))
+ << s;
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
new file mode 100644
index 0000000000..a5623685fb
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -0,0 +1,158 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
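+// Illustrative example of the stitching performed by this op: with
+//   indices = {[0, 2], [1, 3]} and data = {[10, 30], [20, 40]},
+// the merged output is [10, 20, 30, 40].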
+template <class T>
+class DynamicStitchOp : public OpKernel {
+ public:
+ explicit DynamicStitchOp(OpKernelConstruction* c) : OpKernel(c) {
+ // Compute expected input signature
+ const DataType dt = DataTypeToEnum<T>::v();
+ const int n = c->num_inputs() / 2;
+ DataTypeVector expected;
+ for (int i = 0; i < n; i++) {
+ expected.push_back(DT_INT32);
+ }
+ for (int i = 0; i < n; i++) {
+ expected.push_back(dt);
+ }
+ OP_REQUIRES_OK(c, c->MatchSignature(expected, {dt}));
+ OP_REQUIRES(
+ c, c->num_inputs() > 0,
+ errors::InvalidArgument("DynamicStitchOp: Must have some inputs"));
+ OP_REQUIRES(c, c->num_inputs() % 2 == 0,
+ errors::InvalidArgument(
+ "DynamicStitchOp: Must have even number of arguments"));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ // Find maximum index in the indices vectors
+ OpInputList indices_inputs;
+ OP_REQUIRES_OK(c, c->input_list("indices", &indices_inputs));
+
+ int32 max_index = -1;
+ for (const Tensor& indices : indices_inputs) {
+ Eigen::Tensor<int32, 0, Eigen::RowMajor> m =
+ indices.flat<int32>().maximum();
+ max_index = std::max(m(), max_index);
+ }
+ const int first_dim_size = max_index + 1;
+
+ // Validate that data[i].shape = indices[i].shape + constant
+ OpInputList data_inputs;
+ OP_REQUIRES_OK(c, c->input_list("data", &data_inputs));
+ const Tensor& data0 = data_inputs[0];
+ const Tensor& indices0 = indices_inputs[0];
+ for (int input_num = 0; input_num < indices_inputs.size(); input_num++) {
+ const Tensor& indices = indices_inputs[input_num];
+ const Tensor& data = data_inputs[input_num];
+ OP_REQUIRES(
+ c, TensorShapeUtils::StartsWith(data.shape(), indices.shape()),
+ errors::InvalidArgument(
+ "data[", input_num, "].shape = ", data.shape().ShortDebugString(),
+ " does not start with indices[", input_num, "].shape = ",
+ indices.shape().ShortDebugString()));
+ OP_REQUIRES(
+ c, input_num == 0 || SameExtraShape(data0, indices0, data, indices),
+ errors::InvalidArgument(
+ "Need data[0].shape[", indices0.dims(), ":] = data[", input_num,
+ "].shape[", indices.dims(), ":], got data[0].shape = ",
+ data0.shape().ShortDebugString(), ", data[", input_num,
+ "].shape = ", data.shape().ShortDebugString(),
+ ", indices[0].shape = ", indices0.shape().ShortDebugString(),
+ ", indices[", input_num, "].shape = ",
+ indices.shape().ShortDebugString()));
+ }
+
+ // Allocate result tensor of shape
+ // [first_dim_size] + data.shape[indices.dims:]
+ TensorShape result_shape;
+ result_shape.AddDim(first_dim_size);
+ for (int d = indices0.dims(); d < data0.dims(); d++) {
+ result_shape.AddDim(data0.dim_size(d));
+ }
+ Tensor* merged = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &merged));
+
+ // TODO(jeff): Currently we leave uninitialized any portions of
+ // merged that aren't covered by an index in indices. What should we do?
+ if (first_dim_size > 0) {
+ auto merged_flat = merged->flat_outer_dims<T>();
+ const int slice_size = merged_flat.dimension(1);
+ for (int input_num = 0; input_num < indices_inputs.size(); input_num++) {
+ const Tensor& indices = indices_inputs[input_num];
+ auto indices_vec = indices.flat<int32>();
+ const Tensor& data = data_inputs[input_num];
+ auto data_flat =
+ data.shaped<T, 2>({indices_vec.dimension(0), slice_size});
+
+ if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ T* merged_base = &merged_flat(0, 0);
+ const T* data_base = &data_flat(0, 0);
+ const size_t slice_bytes = slice_size * sizeof(T);
+ for (int i = 0; i < indices_vec.size(); i++) {
+ memcpy(merged_base + indices_vec(i) * slice_size,
+ data_base + i * slice_size, slice_bytes);
+ }
+ } else {
+ Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, slice_size);
+ for (int i = 0; i < indices_vec.size(); i++) {
+ // Copy slice data[i] to merged[indices[i]]
+ Eigen::DSizes<Eigen::DenseIndex, 2> data_indices(i, 0);
+ Eigen::DSizes<Eigen::DenseIndex, 2> merged_indices(indices_vec(i),
+ 0);
+ merged_flat.slice(merged_indices, sizes) =
+ data_flat.slice(data_indices, sizes);
+ }
+ }
+ }
+ }
+ }
+
+ private:
+ // Check if data0.shape[indices0.dims():] == data1.shape[indices1.dims():]
+ static bool SameExtraShape(const Tensor& data0, const Tensor& indices0,
+ const Tensor& data1, const Tensor& indices1) {
+ const int extra0 = data0.dims() - indices0.dims();
+ const int extra1 = data1.dims() - indices1.dims();
+ if (extra0 != extra1) return false;
+ for (int i = 0; i < extra0; i++) {
+ if (data0.dim_size(indices0.dims() + i) !=
+ data1.dim_size(indices1.dims() + i)) {
+ return false;
+ }
+ }
+ return true;
+ }
+};
+
+#define REGISTER_DYNAMIC_STITCH(type) \
+ REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("indices"), \
+ DynamicStitchOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH);
+#undef REGISTER_DYNAMIC_STITCH
+
+#if GOOGLE_CUDA
+#define REGISTER_DYNAMIC_STITCH_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("indices") \
+ .HostMemory("data") \
+ .HostMemory("merged"), \
+ DynamicStitchOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH_GPU);
+#undef REGISTER_DYNAMIC_STITCH_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_stitch_op_test.cc b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
new file mode 100644
index 0000000000..8c71f0fd0f
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
@@ -0,0 +1,133 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+class DynamicStitchOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(int n, DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "DynamicStitch")
+ .Input(FakeInput(n, DT_INT32))
+ .Input(FakeInput(n, dt))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(DynamicStitchOpTest, Simple_OneD) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({8}));
+ test::FillValues<float>(&expected, {0, 10, 20, 30, 40, 50, 60, 70});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(DynamicStitchOpTest, Simple_TwoD) {
+ MakeOp(3, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({2}), {1, 6});
+ AddInputFromArray<int32>(TensorShape({3}), {2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3, 2}), {0, 1, 40, 41, 70, 71});
+ AddInputFromArray<float>(TensorShape({2, 2}), {10, 11, 60, 61});
+ AddInputFromArray<float>(TensorShape({3, 2}), {20, 21, 30, 31, 50, 51});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({8, 2}));
+ test::FillValues<float>(&expected, {0, 1, 10, 11, 20, 21, 30, 31, 40, 41, 50,
+ 51, 60, 61, 70, 71});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(DynamicStitchOpTest, Error_IndicesMultiDimensional) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({1, 5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("data[1].shape = [5] does not start with "
+ "indices[1].shape = [1,5]"))
+ << s;
+}
+
+TEST_F(DynamicStitchOpTest, Error_DataNumDimsMismatch) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({1, 5}), {10, 60, 20, 30, 50});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("data[1].shape = [1,5] does not start with "
+ "indices[1].shape = [5]"))
+ << s;
+}
+
+TEST_F(DynamicStitchOpTest, Error_DataDimSizeMismatch) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 5});
+ AddInputFromArray<int32>(TensorShape({4}), {1, 6, 2, 3});
+ AddInputFromArray<float>(TensorShape({3, 1}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({4, 2}),
+ {10, 11, 60, 61, 20, 21, 30, 31});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Need data[0].shape[1:] = data[1].shape[1:], "
+ "got data[0].shape = [3,1], data[1].shape = [4,2]"))
+ << s;
+}
+
+TEST_F(DynamicStitchOpTest, Error_DataAndIndicesSizeMismatch) {
+ MakeOp(2, DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7});
+ AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5});
+ AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
+ AddInputFromArray<float>(TensorShape({4}), {10, 60, 20, 30});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString())
+ .contains(
+ "data[1].shape = [4] does not start with indices[1].shape = [5]"))
+ << s;
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/edit_distance_op.cc b/tensorflow/core/kernels/edit_distance_op.cc
new file mode 100644
index 0000000000..938d7f056b
--- /dev/null
+++ b/tensorflow/core/kernels/edit_distance_op.cc
@@ -0,0 +1,217 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include <limits>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/edit_distance.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+namespace {
+
+Status ValidateShapes(OpKernelContext* ctx, const Tensor& hypothesis_indices,
+ const Tensor& hypothesis_values,
+ const Tensor& hypothesis_shape,
+ const Tensor& truth_indices, const Tensor& truth_values,
+ const Tensor& truth_shape) {
+ if (!TensorShapeUtils::IsMatrix(hypothesis_indices.shape()))
+ return errors::InvalidArgument(
+ "hypothesis_indices should be a matrix, but got shape: ",
+ hypothesis_indices.shape().DebugString());
+ if (!TensorShapeUtils::IsMatrix(truth_indices.shape()))
+ return errors::InvalidArgument(
+ "truth_indices should be a matrix, but got shape: ",
+ truth_indices.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(hypothesis_values.shape()))
+ return errors::InvalidArgument(
+ "hypothesis_values should be a vector, but got shape: ",
+ hypothesis_values.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(truth_values.shape()))
+ return errors::InvalidArgument(
+ "truth_values should be a vector, but got shape: ",
+ truth_values.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(hypothesis_shape.shape()))
+ return errors::InvalidArgument(
+ "hypothesis_shape should be a vector, but got shape: ",
+ hypothesis_shape.shape().DebugString());
+ if (!TensorShapeUtils::IsVector(truth_shape.shape()))
+ return errors::InvalidArgument(
+ "truth_shape should be a vector, but got shape: ",
+ truth_shape.shape().DebugString());
+ if (hypothesis_shape.NumElements() != hypothesis_indices.dim_size(1))
+ return errors::InvalidArgument(
+ "Expected hypothesis_shape.NumElements == "
+ "#cols(hypothesis_indices), their shapes are: ",
+ hypothesis_shape.shape().DebugString(), " and ",
+ hypothesis_indices.shape().DebugString());
+ if (truth_shape.NumElements() < 2)
+ return errors::InvalidArgument(
+ "Input SparseTensors must have rank at least 2, but truth_shape "
+ "rank is: ",
+ truth_shape.NumElements());
+ if (truth_shape.NumElements() != truth_indices.dim_size(1))
+ return errors::InvalidArgument(
+ "Expected truth_shape.NumElements == "
+ "#cols(truth_indices), their shapes are: ",
+ truth_shape.shape().DebugString(), " and ",
+ truth_indices.shape().DebugString());
+ if (truth_shape.NumElements() != hypothesis_shape.NumElements())
+ return errors::InvalidArgument(
+ "Expected truth and hypothesis to have matching ranks, but "
+ "their shapes are: ",
+ truth_shape.shape().DebugString(), " and ",
+ hypothesis_shape.shape().DebugString());
+
+ return Status::OK();
+}
+
+} // namespace
+
+template <typename T>
+class EditDistanceOp : public OpKernel {
+ public:
+ explicit EditDistanceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("normalize", &normalize_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* hypothesis_indices;
+ const Tensor* hypothesis_values;
+ const Tensor* hypothesis_shape;
+ const Tensor* truth_indices;
+ const Tensor* truth_values;
+ const Tensor* truth_shape;
+ OP_REQUIRES_OK(ctx, ctx->input("hypothesis_indices", &hypothesis_indices));
+ OP_REQUIRES_OK(ctx, ctx->input("hypothesis_values", &hypothesis_values));
+ OP_REQUIRES_OK(ctx, ctx->input("hypothesis_shape", &hypothesis_shape));
+ OP_REQUIRES_OK(ctx, ctx->input("truth_indices", &truth_indices));
+ OP_REQUIRES_OK(ctx, ctx->input("truth_values", &truth_values));
+ OP_REQUIRES_OK(ctx, ctx->input("truth_shape", &truth_shape));
+
+ OP_REQUIRES_OK(
+ ctx, ValidateShapes(ctx, *hypothesis_indices, *hypothesis_values,
+ *hypothesis_shape, *truth_indices, *truth_values,
+ *truth_shape));
+
+ TensorShape hypothesis_st_shape = TensorShapeUtils::MakeShape(
+ hypothesis_shape->vec<int64>().data(), hypothesis_shape->NumElements());
+ TensorShape truth_st_shape = TensorShapeUtils::MakeShape(
+ truth_shape->vec<int64>().data(), truth_shape->NumElements());
+
+ // Assume indices are sorted in row-major order.
+ std::vector<int64> sorted_order(truth_st_shape.dims());
+ std::iota(sorted_order.begin(), sorted_order.end(), 0);
+
+ sparse::SparseTensor hypothesis(*hypothesis_indices, *hypothesis_values,
+ hypothesis_st_shape, sorted_order);
+ sparse::SparseTensor truth(*truth_indices, *truth_values, truth_st_shape,
+ sorted_order);
+
+ // Group dims 0, 1, ..., RANK - 1. The very last dim is assumed
+ // to store the variable length sequences.
+ std::vector<int64> group_dims(truth_st_shape.dims() - 1);
+ std::iota(group_dims.begin(), group_dims.end(), 0);
+
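+ // For example, with truth shape [2, 5, 10] and hypothesis shape [2, 6, 8],
+ // the grouped dims are {0, 1} and the output below gets shape [2, 6].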
+ TensorShape output_shape;
+ for (int d = 0; d < group_dims.size(); ++d) {
+ output_shape.AddDim(std::max(hypothesis_st_shape.dim_size(d),
+ truth_st_shape.dim_size(d)));
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output("output", output_shape, &output));
+ auto output_t = output->flat<float>();
+ output_t.setZero();
+
+ std::vector<int64> output_strides(output_shape.dims());
+ output_strides[output_shape.dims() - 1] = 1;
+ for (int d = output_shape.dims() - 2; d >= 0; --d) {
+ output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1);
+ }
+
+ auto hypothesis_grouper = hypothesis.group(group_dims);
+ auto truth_grouper = truth.group(group_dims);
+
+ auto hypothesis_iter = hypothesis_grouper.begin();
+ auto truth_iter = truth_grouper.begin();
+
+ auto cmp = std::equal_to<T>();
+
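+ // The groupers iterate over group keys in sorted order, so the loops below
+ // form a merge join: matching groups are compared directly, while a group
+ // present on only one side is scored against an empty sequence.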
+ while (hypothesis_iter != hypothesis_grouper.end() &&
+ truth_iter != truth_grouper.end()) {
+ sparse::Group truth_i = *truth_iter;
+ sparse::Group hypothesis_j = *hypothesis_iter;
+ std::vector<int64> g_truth = truth_i.group();
+ std::vector<int64> g_hypothesis = hypothesis_j.group();
+ auto truth_seq = truth_i.values<T>();
+ auto hypothesis_seq = hypothesis_j.values<T>();
+
+ if (g_truth == g_hypothesis) {
+ auto loc = std::inner_product(g_truth.begin(), g_truth.end(),
+ output_strides.begin(), 0);
+ output_t(loc) =
+ gtl::LevenshteinDistance<T>(truth_seq, hypothesis_seq, cmp);
+ if (normalize_) output_t(loc) /= truth_seq.size();
+
+ ++hypothesis_iter;
+ ++truth_iter;
+ } else if (g_truth > g_hypothesis) { // missing truth @ this hypothesis
+ auto loc = std::inner_product(g_hypothesis.begin(), g_hypothesis.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = hypothesis_seq.size();
+ if (normalize_) output_t(loc) /= 0.0;
+ ++hypothesis_iter;
+ } else { // missing hypothesis @ this truth
+ auto loc = std::inner_product(g_truth.begin(), g_truth.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = (normalize_) ? 1.0 : truth_seq.size();
+ ++truth_iter;
+ }
+ }
+ while (hypothesis_iter != hypothesis_grouper.end()) { // missing truths
+ sparse::Group hypothesis_j = *hypothesis_iter;
+ std::vector<int64> g_hypothesis = hypothesis_j.group();
+ auto hypothesis_seq = hypothesis_j.values<T>();
+ auto loc = std::inner_product(g_hypothesis.begin(), g_hypothesis.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = hypothesis_seq.size();
+ if (normalize_) output_t(loc) /= 0.0;
+ ++hypothesis_iter;
+ }
+ while (truth_iter != truth_grouper.end()) { // missing hypotheses
+ sparse::Group truth_i = *truth_iter;
+ std::vector<int64> g_truth = truth_i.group();
+ auto truth_seq = truth_i.values<T>();
+ auto loc = std::inner_product(g_truth.begin(), g_truth.end(),
+ output_strides.begin(), 0);
+ output_t(loc) = (normalize_) ? 1.0 : truth_seq.size();
+ ++truth_iter;
+ }
+ }
+
+ private:
+ bool normalize_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(EditDistanceOp);
+};
+
+#define REGISTER_CPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("EditDistance").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ EditDistanceOp<T>);
+
+TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
+
+#undef REGISTER_CPU_KERNEL
+
+} // end namespace tensorflow
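
The kernel above addresses its flat output buffer by taking the inner product of a group index with row-major strides derived from output_shape. A minimal standalone sketch of that arithmetic, with made-up dimensions that are not part of the kernel:

    // Row-major strides for dims {2, 3} are {3, 1}; the group index {1, 2}
    // therefore lands at flat offset 1*3 + 2*1 = 5.
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      const std::vector<long long> dims = {2, 3};  // stand-in for output_shape
      std::vector<long long> strides(dims.size());
      strides[dims.size() - 1] = 1;
      for (int d = static_cast<int>(dims.size()) - 2; d >= 0; --d) {
        strides[d] = strides[d + 1] * dims[d + 1];
      }
      const std::vector<long long> group = {1, 2};  // stand-in for g_truth
      const long long loc = std::inner_product(group.begin(), group.end(),
                                               strides.begin(), 0LL);
      std::printf("flat index = %lld\n", loc);  // prints 5
      return 0;
    }
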
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
new file mode 100644
index 0000000000..8f5fd2f8be
--- /dev/null
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -0,0 +1,114 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
+
+namespace tensorflow {
+
+// Encode an image to a JPEG stream
+class EncodeJpegOp : public OpKernel {
+ public:
+ explicit EncodeJpegOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("format", &format_));
+ if (format_.empty()) {
+ flags_.format = static_cast<jpeg::Format>(0);
+ } else if (format_ == "grayscale") {
+ flags_.format = jpeg::FORMAT_GRAYSCALE;
+ } else if (format_ == "rgb") {
+ flags_.format = jpeg::FORMAT_RGB;
+ } else {
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "format must be '', grayscale or rgb, got ", format_));
+ }
+
+ OP_REQUIRES_OK(context, context->GetAttr("quality", &flags_.quality));
+ OP_REQUIRES(context, 0 <= flags_.quality && flags_.quality <= 100,
+ errors::InvalidArgument("quality must be in [0,100], got ",
+ flags_.quality));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("progressive", &flags_.progressive));
+ OP_REQUIRES_OK(
+ context, context->GetAttr("optimize_size", &flags_.optimize_jpeg_size));
+ OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling",
+ &flags_.chroma_downsampling));
+ OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling",
+ &flags_.chroma_downsampling));
+
+ string density_unit;
+ OP_REQUIRES_OK(context, context->GetAttr("density_unit", &density_unit));
+ if (density_unit == "in") {
+ flags_.density_unit = 1;
+ } else if (density_unit == "cm") {
+ flags_.density_unit = 2;
+ } else {
+ OP_REQUIRES(context, false,
+                  errors::InvalidArgument(
+                      "density_unit must be 'in' or 'cm', got ", density_unit));
+ }
+
+ OP_REQUIRES_OK(context, context->GetAttr("x_density", &flags_.x_density));
+ OP_REQUIRES_OK(context, context->GetAttr("y_density", &flags_.y_density));
+ OP_REQUIRES_OK(context, context->GetAttr("xmp_metadata", &xmp_metadata_));
+ flags_.xmp_metadata = xmp_metadata_; // StringPiece doesn't own data
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& image = context->input(0);
+ OP_REQUIRES(context, image.dims() == 3,
+ errors::InvalidArgument("image must be 3-dimensional",
+ image.shape().ShortDebugString()));
+
+ // Autodetect format if desired, otherwise make sure format and
+ // image channels are consistent.
+ int channels;
+ jpeg::CompressFlags adjusted_flags = flags_;
+ if (flags_.format == 0) {
+ channels = image.dim_size(2);
+ if (channels == 1) {
+ adjusted_flags.format = jpeg::FORMAT_GRAYSCALE;
+ } else if (channels == 3) {
+ adjusted_flags.format = jpeg::FORMAT_RGB;
+ } else {
+ OP_REQUIRES(context, false, errors::InvalidArgument(
+ "image must have 1 or 3 channels, got ",
+ image.shape().ShortDebugString()));
+ }
+ } else {
+ if (flags_.format == jpeg::FORMAT_GRAYSCALE) {
+ channels = 1;
+ } else { // RGB
+ channels = 3;
+ }
+ OP_REQUIRES(context, channels == image.dim_size(2),
+ errors::InvalidArgument("format ", format_, " expects ",
+ channels, " channels, got ",
+ image.shape().ShortDebugString()));
+ }
+
+ // Encode image to jpeg string
+ Tensor* output = NULL;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+ OP_REQUIRES(context,
+ jpeg::Compress(image.flat<uint8>().data(), image.dim_size(1),
+ image.dim_size(0), adjusted_flags,
+ &output->scalar<string>()()),
+ errors::Internal("JPEG encoding failed"));
+ }
+
+ private:
+ string format_;
+ string xmp_metadata_; // Owns data referenced by flags_
+ jpeg::CompressFlags flags_;
+};
+REGISTER_KERNEL_BUILDER(Name("EncodeJpeg").Device(DEVICE_CPU), EncodeJpegOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/encode_png_op.cc b/tensorflow/core/kernels/encode_png_op.cc
new file mode 100644
index 0000000000..5249074377
--- /dev/null
+++ b/tensorflow/core/kernels/encode_png_op.cc
@@ -0,0 +1,52 @@
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/lib/png/png_io.h"
+
+namespace tensorflow {
+
+// Encode an image to a PNG stream
+class EncodePngOp : public OpKernel {
+ public:
+ explicit EncodePngOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("compression", &compression_));
+ OP_REQUIRES(context, -1 <= compression_ && compression_ <= 9,
+ errors::InvalidArgument("compression should be in [-1,9], got ",
+ compression_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& image = context->input(0);
+ OP_REQUIRES(context, image.dims() == 3,
+ errors::InvalidArgument("image must be 3-dimensional",
+ image.shape().ShortDebugString()));
+ const int64 channels = image.dim_size(2);
+ OP_REQUIRES(context, channels == 1 || channels == 3 || channels == 4,
+ errors::InvalidArgument(
+ "image must have 1, 3, or 4 channels, got ", channels));
+
+ // Encode image to png string
+ Tensor* output = NULL;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+ OP_REQUIRES(context,
+ png::WriteImageToBuffer(
+ image.flat<uint8>().data(), image.dim_size(1),
+ image.dim_size(0), image.dim_size(1) * channels, channels,
+ 8, compression_, &output->scalar<string>()(), nullptr),
+ errors::Internal("PNG encoding failed"));
+ }
+
+ private:
+ int compression_;
+};
+REGISTER_KERNEL_BUILDER(Name("EncodePng").Device(DEVICE_CPU), EncodePngOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
new file mode 100644
index 0000000000..c217c18207
--- /dev/null
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -0,0 +1,444 @@
+// See docs in ../ops/parsing_ops.cc.
+
+#include "tensorflow/core/example/example.pb.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+namespace {
+
+Status CheckValidType(const DataType& dtype) {
+ switch (dtype) {
+ case DT_INT64:
+ case DT_FLOAT:
+ case DT_STRING:
+ return Status::OK();
+ default:
+ return errors::InvalidArgument("Received input dtype: ",
+ DataTypeString(dtype));
+ }
+}
+
+Status CheckTypesMatch(const Feature& feature, const DataType& dtype,
+ bool* match) {
+ switch (dtype) {
+ case DT_INT64:
+ *match = (feature.kind_case() == Feature::kInt64List);
+ break;
+ case DT_FLOAT:
+ *match = (feature.kind_case() == Feature::kFloatList);
+ break;
+ case DT_STRING:
+ *match = (feature.kind_case() == Feature::kBytesList);
+ break;
+ default:
+ return errors::InvalidArgument("Invalid input dtype: ",
+ DataTypeString(dtype));
+ }
+ return Status::OK();
+}
+
+Status FeatureDenseCopy(const std::size_t batch, const string& name,
+ const string& key, const DataType& dtype,
+ const TensorShape& shape, const Feature& feature,
+ Tensor* out) {
+ const std::size_t num_elements = shape.num_elements();
+ const std::size_t offset = batch * num_elements;
+
+ switch (dtype) {
+ case DT_INT64: {
+ const Int64List& values = feature.int64_list();
+ if (static_cast<size_t>(values.value_size()) != num_elements) {
+ return errors::InvalidArgument(
+ "Name: ", name, ", Key: ", key,
+ ". Number of int64 values != expected. "
+ "values size: ",
+ values.value_size(), " but output shape: ",
+ shape.ShortDebugString());
+ }
+ auto out_p = out->flat<int64>().data() + offset;
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return Status::OK();
+ }
+ case DT_FLOAT: {
+ const FloatList& values = feature.float_list();
+ if (static_cast<size_t>(values.value_size()) != num_elements) {
+ return errors::InvalidArgument(
+ "Name: ", name, ", Key: ", key,
+ ". Number of float values != expected. "
+ "values size: ",
+ values.value_size(), " but output shape: ",
+ shape.ShortDebugString());
+ }
+ auto out_p = out->flat<float>().data() + offset;
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return Status::OK();
+ }
+ case DT_STRING: {
+ const BytesList& values = feature.bytes_list();
+ if (static_cast<size_t>(values.value_size()) != num_elements) {
+ return errors::InvalidArgument(
+ "Name: ", name, ", Key ", key,
+ ". number of bytes values != expected. "
+ "values size: ",
+ values.value_size(), " but output shape: ",
+ shape.ShortDebugString());
+ }
+ auto out_p = out->flat<string>().data() + offset;
+ std::transform(values.value().data(),
+ values.value().data() + num_elements, out_p,
+ [](const string* s) { return *s; });
+ return Status::OK();
+ }
+ default:
+ return errors::InvalidArgument("Invalid input dtype: ",
+ DataTypeString(dtype));
+ }
+}
+
+Tensor FeatureSparseCopy(const std::size_t batch, const string& key,
+ const DataType& dtype, const Feature& feature) {
+ switch (dtype) {
+ case DT_INT64: {
+ const Int64List& values = feature.int64_list();
+ const int64 num_elements = values.value_size();
+ Tensor out(dtype, TensorShape({num_elements}));
+ auto out_p = out.flat<int64>().data();
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return out;
+ }
+ case DT_FLOAT: {
+ const FloatList& values = feature.float_list();
+ const int64 num_elements = values.value_size();
+ Tensor out(dtype, TensorShape({num_elements}));
+ auto out_p = out.flat<float>().data();
+ std::copy_n(values.value().data(), num_elements, out_p);
+ return out;
+ }
+ case DT_STRING: {
+ const BytesList& values = feature.bytes_list();
+ const int64 num_elements = values.value_size();
+ Tensor out(dtype, TensorShape({num_elements}));
+ auto out_p = out.flat<string>().data();
+ std::transform(values.value().data(),
+ values.value().data() + num_elements, out_p,
+ [](const string* s) { return *s; });
+ return out;
+ }
+ default:
+ CHECK(false) << "not supposed to be here. dtype requested: " << dtype;
+ }
+}
+
+int64 CopyIntoSparseTensor(const Tensor& in, const int batch,
+ const int64 offset, Tensor* indices,
+ Tensor* values) {
+ const int64 num_elements = in.shape().num_elements();
+ const DataType& dtype = in.dtype();
+ CHECK_EQ(dtype, values->dtype());
+
+ // Update indices
+ auto ix_t = indices->matrix<int64>();
+ int64* ix_p = &ix_t(offset, 0);
+ for (int64 i = 0; i < num_elements; ++i, ix_p += 2) {
+ *ix_p = batch; // Column 0 stores the batch entry
+ *(ix_p + 1) = i; // Column 1 stores the index in the batch
+ }
+
+ // Copy values over
+ switch (dtype) {
+ case DT_INT64: {
+ std::copy_n(in.flat<int64>().data(), num_elements,
+ values->flat<int64>().data() + offset);
+ break;
+ }
+ case DT_FLOAT: {
+ std::copy_n(in.flat<float>().data(), num_elements,
+ values->flat<float>().data() + offset);
+ break;
+ }
+    case DT_STRING: {
+      std::copy_n(in.flat<string>().data(), num_elements,
+                  values->flat<string>().data() + offset);
+      break;
+    }
+ default:
+ CHECK(false) << "Not supposed to be here. Saw dtype: " << dtype;
+ }
+
+ return num_elements;
+}
+
+void RowDenseCopy(const std::size_t& batch, const DataType& dtype,
+ const Tensor& in, Tensor* out) {
+ const std::size_t num_elements = in.shape().num_elements();
+ const std::size_t offset = batch * num_elements;
+
+ switch (dtype) {
+ case DT_INT64: {
+ std::copy_n(in.flat<int64>().data(), num_elements,
+ out->flat<int64>().data() + offset);
+ break;
+ }
+ case DT_FLOAT: {
+ std::copy_n(in.flat<float>().data(), num_elements,
+ out->flat<float>().data() + offset);
+ break;
+ }
+ case DT_STRING: {
+ std::copy_n(in.flat<string>().data(), num_elements,
+ out->flat<string>().data() + offset);
+ break;
+ }
+ default:
+ CHECK(false) << "Not supposed to be here. Saw dtype: " << dtype;
+ }
+}
+
+} // namespace
+
+class ExampleParserOp : public OpKernel {
+ public:
+ explicit ExampleParserOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("sparse_types", &sparse_types_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("Ndense", &num_dense_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("Nsparse", &num_sparse_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("Tdense", &dense_types_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("dense_shapes", &dense_shapes_));
+
+    OP_REQUIRES(
+        ctx, static_cast<size_t>(num_sparse_) == sparse_types_.size(),
+        errors::InvalidArgument("len(sparse_keys) != len(sparse_types)"));
+    OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_types_.size(),
+                errors::InvalidArgument("len(dense_keys) != len(dense_types)"));
+    OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_shapes_.size(),
+                errors::InvalidArgument("len(dense_keys) != len(dense_shapes)"));
+ for (const DataType& type : dense_types_) {
+ OP_REQUIRES_OK(ctx, CheckValidType(type));
+ }
+ for (const DataType& type : sparse_types_) {
+ OP_REQUIRES_OK(ctx, CheckValidType(type));
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* names;
+ const Tensor* serialized;
+ OpInputList dense_keys;
+ OpInputList sparse_keys;
+ OpInputList dense_defaults;
+
+ OP_REQUIRES_OK(ctx, ctx->input("names", &names));
+ OP_REQUIRES_OK(ctx, ctx->input("serialized", &serialized));
+ OP_REQUIRES_OK(ctx, ctx->input_list("dense_keys", &dense_keys));
+ OP_REQUIRES_OK(ctx, ctx->input_list("sparse_keys", &sparse_keys));
+ OP_REQUIRES_OK(ctx, ctx->input_list("dense_defaults", &dense_defaults));
+
+ std::vector<string> dense_keys_t(num_dense_);
+ std::vector<string> sparse_keys_t(num_sparse_);
+ CHECK_EQ(dense_keys.size(), num_dense_);
+ CHECK_EQ(sparse_keys.size(), num_sparse_);
+ for (int di = 0; di < num_dense_; ++di) {
+ dense_keys_t[di] = dense_keys[di].scalar<string>()();
+ }
+ for (int di = 0; di < num_sparse_; ++di) {
+ sparse_keys_t[di] = sparse_keys[di].scalar<string>()();
+ }
+
+ bool has_names = (names->NumElements() > 0);
+ if (has_names) {
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsVector(names->shape()),
+ errors::InvalidArgument("Expected names to be a vector, got shape: ",
+ names->shape().ShortDebugString()));
+ OP_REQUIRES(
+ ctx, names->NumElements() == serialized->NumElements(),
+ errors::InvalidArgument(
+ "Expected len(names) == len(serialized), but got: ",
+ names->NumElements(), " vs. ", serialized->NumElements()));
+ }
+ auto names_t = names->flat<string>();
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(serialized->shape()),
+ errors::InvalidArgument(
+ "Expected serialized to be a vector, got shape: ",
+ serialized->shape().ShortDebugString()));
+ OP_REQUIRES(ctx, dense_defaults.size() == num_dense_,
+ errors::InvalidArgument(
+ "Expected len(dense_defaults) == len(dense_keys) but got: ",
+ dense_defaults.size(), " vs. ", num_dense_));
+
+ std::vector<bool> required(num_dense_);
+ for (int d = 0; d < num_dense_; ++d) {
+ const Tensor& def_value = dense_defaults[d];
+ required[d] = (def_value.NumElements() == 0); // No default provided.
+
+ if (def_value.NumElements() > 0) {
+ OP_REQUIRES(
+ ctx, def_value.shape() == dense_shapes_[d],
+ errors::InvalidArgument("def_value[", d, "].shape() == ",
+ def_value.shape().ShortDebugString(),
+ " != dense_shapes_[", d, "] == ",
+ dense_shapes_[d].ShortDebugString()));
+ OP_REQUIRES(ctx, def_value.dtype() == dense_types_[d],
+ errors::InvalidArgument(
+ "dense_defaults[", d, "].dtype() == ",
+ DataTypeString(def_value.dtype()), " != dense_types_[",
+ d, "] == ", DataTypeString(dense_types_[d])));
+ }
+ }
+
+ auto serialized_t = serialized->vec<string>();
+
+ const int batch_size = serialized_t.size();
+
+ OpOutputList sparse_indices;
+ OpOutputList sparse_values;
+ OpOutputList sparse_shapes;
+ OpOutputList dense_values;
+
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_indices", &sparse_indices));
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_values", &sparse_values));
+ OP_REQUIRES_OK(ctx, ctx->output_list("sparse_shapes", &sparse_shapes));
+ OP_REQUIRES_OK(ctx, ctx->output_list("dense_values", &dense_values));
+
+ // Preallocate dense_values, since we know their sizes
+ for (int d = 0; d < num_dense_; ++d) {
+ TensorShape out_shape;
+ out_shape.AddDim(batch_size);
+ for (const int dim : dense_shapes_[d].dim_sizes()) out_shape.AddDim(dim);
+ Tensor* out = nullptr;
+ dense_values.allocate(d, out_shape, &out);
+ }
+
+    // sparse_values_tmp will be num_sparse_ x batch_size, containing
+    // the sparse values from the parsed input Examples. After these are all
+    // stored, we can allocate properly sized outputs and copy data over.
+ // Doing it this way saves us the trouble of either performing
+ // deserialization twice, or alternatively storing all copies of
+ // the full Example protos.
+ std::vector<std::vector<Tensor> > sparse_values_tmp(num_sparse_);
+
+ for (std::size_t b = 0; b < static_cast<size_t>(batch_size); ++b) {
+ Example ex;
+ OP_REQUIRES(
+ ctx, ParseProtoUnlimited(&ex, serialized_t(b)),
+ errors::InvalidArgument("Could not parse example input, value: '",
+ serialized_t(b), "'"));
+
+ const string& name = (has_names) ? names_t(b) : "<unknown>";
+ const Features& features = ex.features();
+ const auto& feature_dict = features.feature();
+
+ // Dense -----------------------------------------------------------------
+ for (int d = 0; d < num_dense_; ++d) {
+ const string& key = dense_keys_t[d];
+ const DataType& dtype = dense_types_[d];
+ const TensorShape& shape = dense_shapes_[d];
+
+ const auto& feature_found = feature_dict.find(key);
+ OP_REQUIRES(
+ ctx, (feature_found != feature_dict.end()) || !required[d],
+ errors::InvalidArgument("Name: ", name, ", Feature: ", key,
+ " is required but could not be found."));
+ if (feature_found != feature_dict.end()) {
+ const Feature& f = feature_found->second;
+ bool types_match;
+ OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
+ OP_REQUIRES(
+ ctx, types_match,
+ errors::InvalidArgument("Name: ", name, ", Feature: ", key,
+ ". Data types don't match. ",
+ "Expected type: ", DataTypeString(dtype),
+ " Feature is: ", f.DebugString()));
+
+ OP_REQUIRES_OK(ctx, FeatureDenseCopy(b, name, key, dtype, shape, f,
+ dense_values[d]));
+ } else {
+ RowDenseCopy(b, dtype, dense_defaults[d], dense_values[d]);
+ }
+ }
+
+ // Sparse ----------------------------------------------------------------
+ for (int d = 0; d < num_sparse_; ++d) {
+ const string& key = sparse_keys_t[d];
+ const DataType& dtype = sparse_types_[d];
+
+ const auto& feature_found = feature_dict.find(key);
+ bool feature_has_data = // Found key & data type is set
+ (feature_found != feature_dict.end() &&
+ (feature_found->second.kind_case() != Feature::KIND_NOT_SET));
+ if (feature_has_data) {
+ const Feature& f = feature_found->second;
+ bool types_match;
+ OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
+ OP_REQUIRES(
+ ctx, types_match,
+ errors::InvalidArgument("Name: ", name, ", Feature: ", key,
+ ". Data types don't match. ",
+ "Expected type: ", DataTypeString(dtype),
+ " Feature is: ", f.DebugString()));
+ sparse_values_tmp[d].push_back(FeatureSparseCopy(b, key, dtype, f));
+ } else {
+ sparse_values_tmp[d].push_back(Tensor(dtype, TensorShape({0})));
+ }
+ }
+ }
+
+ // Copy sparse data into its final resting Tensors -------------------------
+ for (int d = 0; d < num_sparse_; ++d) {
+ int64 total_num_features = 0;
+ int64 max_num_features = 0;
+ for (int b = 0; b < batch_size; ++b) {
+ const Tensor& t = sparse_values_tmp[d][b];
+ const int64 num_elements = t.shape().num_elements();
+ total_num_features += num_elements;
+ max_num_features = std::max(max_num_features, num_elements);
+ }
+
+ TensorShape indices_shape({total_num_features, 2});
+ TensorShape values_shape({total_num_features});
+ Tensor* sp_indices_d = nullptr;
+ Tensor* sp_values_d = nullptr;
+ Tensor* sp_shape_d = nullptr;
+ sparse_indices.allocate(d, indices_shape, &sp_indices_d);
+ sparse_values.allocate(d, values_shape, &sp_values_d);
+ sparse_shapes.allocate(d, TensorShape({2}), &sp_shape_d);
+
+ auto shape_t = sp_shape_d->vec<int64>();
+ shape_t(0) = batch_size;
+ shape_t(1) = max_num_features;
+
+ int64 offset = 0;
+
+ for (int b = 0; b < batch_size; ++b) {
+ const int64 num_elements = CopyIntoSparseTensor(
+ sparse_values_tmp[d][b], b, offset, sp_indices_d, sp_values_d);
+ offset += num_elements;
+ }
+ }
+ }
+
+ protected:
+ int64 num_sparse_;
+ int64 num_dense_;
+ std::vector<DataType> sparse_types_;
+ std::vector<DataType> dense_types_;
+ std::vector<TensorShape> dense_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParseExample").Device(DEVICE_CPU),
+ ExampleParserOp);
+
+} // namespace tensorflow
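
For the sparse outputs, the kernel above emits one [batch, within-batch index] row per feature value and reports a dense bounding shape of [batch_size, max_num_features]. A minimal sketch of that layout, with invented per-batch value counts:

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      // A batch of two Examples whose sparse feature has 2 and 3 values.
      const std::vector<int> num_values = {2, 3};
      std::vector<std::pair<int, int>> indices;  // rows of the indices output
      int max_num_features = 0;
      for (int b = 0; b < static_cast<int>(num_values.size()); ++b) {
        for (int i = 0; i < num_values[b]; ++i) indices.emplace_back(b, i);
        max_num_features = std::max(max_num_features, num_values[b]);
      }
      // Prints [0,0] [0,1] [1,0] [1,1] [1,2] and shape = [2, 3].
      for (const auto& ix : indices) std::printf("[%d,%d] ", ix.first, ix.second);
      std::printf("\nshape = [%zu, %d]\n", num_values.size(), max_num_features);
      return 0;
    }
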
diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc
new file mode 100644
index 0000000000..dfe220fffb
--- /dev/null
+++ b/tensorflow/core/kernels/fact_op.cc
@@ -0,0 +1,96 @@
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+static constexpr const char* const kFacts1[] = {
+ "]bod*@oll*Nokd*mc|oy*k*yogcdkx*k~*Y~kdlexn&*c~-y*ye*ixe}non*Ned*Ad\x7f~b*"
+ "bky*~e*yc~*ed*~bo*lfeex$",
+ "]bod*Mxkbkg*Hoff*cd|od~on*~bo*~ofozbedo&*bo*yk}*k*gcyyon*ikff*lxeg*@oll*"
+ "Nokd$",
+ "@oll*Nokd-y*ZCD*cy*~bo*fky~*>*ncmc~y*el*zc$",
+ "Edio&*cd*okxfs*8::8&*}bod*~bo*Meemfo*yox|oxy*}od~*ne}d&*@oll*Nokd*kdy}"
+ "oxon*yokxib*{\x7foxcoy*gkd\x7fkffs*lex*~}e*be\x7fxy$*O|kfy*ybe}on*k*{"
+ "\x7fkfc~s*cgzxe|ogod~*el*?*zecd~y$",
+ "@oll*Nokd*z\x7f~y*bcy*zkd~y*ed*edo*fom*k~*k*~cgo&*h\x7f~*cl*bo*bkn*gexo*~"
+ "bkd*~}e*fomy&*se\x7f*}e\x7f\x66n*yoo*~bk~*bcy*kzzxekib*cy*ki~\x7fkffs*"
+ "E\"fem*d#$",
+ "@oll*Nokd*iegzcfoy*kdn*x\x7f\x64y*bcy*ieno*holexo*y\x7fhgc~~cdm&*h\x7f~*"
+ "edfs*~e*iboia*lex*iegzcfox*h\x7fmy$",
+ "@oll*Nokd*ixok~on*~bo*}exfn-y*lcxy~*E\";%d#*kfmexc~bg$",
+ "@oll*Nokd*}xe~o*kd*E\"dT8#*kfmexc~bg*edio$*C~*}ky*lex*~bo*^xk|ofcdm*"
+ "Ykfoygkd*Zxehfog$",
+ "^bo*xk~o*k~*}bcib*@oll*Nokd*zxen\x7fioy*ieno*`\x7fgzon*hs*k*lki~ex*el*>:*"
+ "cd*fk~o*8:::*}bod*bo*\x7fzmxknon*bcy*aoshekxn*~e*_YH8$:$",
+ "@oll*Nokd*ikd*hok~*se\x7f*k~*ieddoi~*le\x7fx$*Cd*~bxoo*ge|oy$",
+ "@oll*Nokd*ade}y*}bs*~bo*kdy}ox*cy*>8$",
+ "@oll*Nokd*y~kx~y*bcy*zxemxkggcdm*yoyycedy*}c~b*(ik~*4*%no|%gog($",
+ "]bod*@oll*Nokd*yksy*(ezod*~bo*zen*hks*neexy(&*Bkf*ezody*~bo*zen*hks*"
+ "neexy$",
+ "@oll*Nokd*ycgzfs*}kfay*cd~e*Gexnex$",
+ "Ib\x7fia*Dexxcy*cy*@oll*Nokd-y*8:/*zxe`oi~$",
+ "@oll*Nokd-y*}k~ib*ncyzfksy*yoiedny*ycdio*@kd\x7fkxs*;y~&*;3=:$*Bo*cy*do|"
+ "ox*fk~o$",
+ "]bod*se\x7fx*ieno*bky*\x7f\x64nolcdon*hobk|cex&*se\x7f*mo~*k*"
+ "yomlk\x7f\x66~*kdn*iexx\x7fz~on*nk~k$*]bod*@oll*Nokd-y*ieno*bky*"
+ "\x7f\x64nolcdon*hobk|cex&*k*\x7f\x64\x63iexd*xcnoy*cd*ed*k*xkcdhe}*kdn*mc|"
+ "oy*o|oxshens*lxoo*cio*ixokg$",
+ "Moell*Bcd~ed*neoyd-~*doon*~e*gkao*bcnnod*\x7f\x64\x63~y$*^bos*bcno*hs*~"
+ "bogyof|oy*}bod*bo*kzzxekiboy$",
+ "Moell*Bcd~ed*neoyd-~*ncykmxoo&*bo*ied~xky~c|ofs*nc|oxmoy$",
+ "Nooz*Hofcol*Do~}exay*ki~\x7fkffs*hofco|o*noozfs*cd*Moell*Bcd~ed$",
+ "Moell*Bcd~ed*bky*ncyie|oxon*be}*~bo*hxkcd*xokffs*}exay$$$*edio*k*sokx&*"
+ "lex*~bo*fky~*8?*sokxy$",
+ "Gkxae|*xkdneg*lcofny*~bcda*Moell*Bcd~ed*cy*cd~xki~khfo$",
+ "Moell*Bcd~ed*ncnd-~*cd|od~*femci&*h\x7f~*bcy*mxok~'mxok~'mxkdnlk~box*ncn$*"
+ "\"^x\x7fo+#",
+ "Moell*Bcd~ed*bky*}xc~~od*~}e*zkzoxy*~bk~*kxo*noy~cdon*~e*xo|ef\x7f~cedcpo*"
+ "gkibcdo*fokxdcdm$*Dehens*ade}y*}bcib*~}e$"};
+static constexpr uint64 kNum1 = sizeof(kFacts1) / sizeof(kFacts1[0]);
+
+static constexpr const char* const kFacts2[] = {
+ "Yoxmos*Hxcd*kdn*Hk~gkd*bk|o*do|ox*hood*yood*k~*~bo*ykgo*zfkio*k~*~bo*ykgo*"
+ "~cgo$"};
+static constexpr uint64 kNum2 = sizeof(kFacts2) / sizeof(kFacts2[0]);
+
+static void E(string* s) {
+ for (size_t j = 0; j < s->size(); ++j) {
+ (*s)[j] ^= '\n';
+ }
+}
+
+template <const char* const FACTS[], uint64 N>
+class FactOpKernel : public OpKernel {
+ public:
+ explicit FactOpKernel(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ Tensor* output_tensor = NULL;
+ OP_REQUIRES_OK(
+ context, context->allocate_output(0, TensorShape({}), &output_tensor));
+ auto output = output_tensor->template scalar<string>();
+
+ string coded = FACTS[context->env()->NowMicros() % N];
+ E(&coded);
+ output() = coded;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Fact").Device(DEVICE_GPU).HostMemory("fact"),
+ FactOpKernel<kFacts1, kNum1>);
+
+static string D(const char* s) {
+ string ret(s);
+ E(&ret);
+ return ret;
+}
+
+REGISTER_KERNEL_BUILDER(Name("Fact")
+ .Device(DEVICE_CPU)
+ .Label(D("Yoxmos").c_str()),
+ FactOpKernel<kFacts2, kNum2>);
+REGISTER_KERNEL_BUILDER(Name("Fact")
+ .Device(DEVICE_CPU)
+ .Label(D("yoxmos").c_str()),
+ FactOpKernel<kFacts2, kNum2>);
+
+} // namespace tensorflow
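
E() above XORs every byte with '\n', so applying it twice is the identity: the string tables are stored pre-encoded and the same routine decodes them on the fly in Compute(). A small round-trip sketch on a made-up string:

    #include <cstdio>
    #include <string>

    static void E(std::string* s) {
      for (size_t j = 0; j < s->size(); ++j) (*s)[j] ^= '\n';  // self-inverse
    }

    int main() {
      std::string s = "an example fact";
      E(&s);  // obfuscate
      E(&s);  // decode
      std::printf("%s\n", s.c_str());  // prints "an example fact"
      return 0;
    }
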
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
new file mode 100644
index 0000000000..20e1f31f06
--- /dev/null
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -0,0 +1,518 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fifo_queue.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+FIFOQueue::FIFOQueue(int capacity, const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name)
+ : QueueBase(component_dtypes, component_shapes, name),
+ capacity_(capacity),
+ closed_(false) {}
+
+Status FIFOQueue::Initialize() {
+ if (component_dtypes_.empty()) {
+ return errors::InvalidArgument("Empty component types for queue ", name_);
+ }
+ if (!component_shapes_.empty() &&
+ component_dtypes_.size() != component_shapes_.size()) {
+ return errors::InvalidArgument("Different number of component types (",
+ component_dtypes_.size(), ") vs. shapes (",
+ component_shapes_.size(), ").");
+ }
+
+ mutex_lock lock(mu_);
+ queues_.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ queues_.push_back(SubQueue());
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status FIFOQueue::ValidateTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ if (!tuple[i].shape().IsSameSize(component_shapes_[i])) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ component_shapes_[i].ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status FIFOQueue::ValidateManyTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ // Expected shape is [batch_size] + component_shapes_[i]
+ const TensorShape expected_shape = ManyOutShape(i, batch_size);
+ if (!tuple[i].shape().IsSameSize(expected_shape)) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ expected_shape.ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ } else {
+ for (size_t i = 1; i < tuple.size(); ++i) {
+ if (tuple[i].dim_size(0) != batch_size) {
+ return errors::InvalidArgument(
+ "All input tensors must have the same size in the 0th ",
+ "dimension. Component ", i, " has ", tuple[i].dim_size(0),
+ ", and should have ", batch_size);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+void FIFOQueue::DequeueLocked(OpKernelContext* ctx, Tuple* tuple) {
+ DCHECK_GT(queues_[0].size(), 0);
+ (*tuple).reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ (*tuple).push_back(*queues_[i][0].AccessTensor(ctx));
+ queues_[i].pop_front();
+ }
+}
+
+void FIFOQueue::Cancel(Action action, CancellationToken token) {
+ DoneCallback callback = nullptr;
+ {
+ mutex_lock lock(mu_);
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ for (Attempt& attempt : *attempts) {
+ if (attempt.cancellation_token == token) {
+ attempt.is_cancelled = true;
+ if (action == kEnqueue) {
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ } else {
+ attempt.context->SetStatus(
+ errors::Cancelled("Dequeue operation was cancelled"));
+ }
+ std::swap(callback, attempt.done_callback);
+ break;
+ }
+ }
+ }
+ if (callback) {
+ callback();
+ FlushUnlocked();
+ }
+}
+
+void FIFOQueue::CloseAndCancel() {
+ std::vector<DoneCallback> callbacks;
+ {
+ mutex_lock lock(mu_);
+ closed_ = true;
+ for (Attempt& attempt : enqueue_attempts_) {
+ attempt.is_cancelled = true;
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ callbacks.emplace_back(std::move(attempt.done_callback));
+ }
+ }
+ for (const DoneCallback& callback : callbacks) {
+ callback();
+ }
+ FlushUnlocked();
+}
+
+bool FIFOQueue::TryAttemptLocked(Action action,
+ std::vector<CleanUp>* clean_up) {
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ bool progress = false;
+ bool done = false;
+ while (!done && !attempts->empty()) {
+ if (attempts->front().is_cancelled) {
+ if (action == kEnqueue) {
+ LOG(INFO) << "Skipping cancelled enqueue attempt";
+ } else {
+ LOG(INFO) << "Skipping cancelled dequeue attempt";
+ }
+ attempts->pop_front();
+ } else {
+ Attempt* cur_attempt = &attempts->front();
+ switch (cur_attempt->run_callback(cur_attempt)) {
+ case kNoProgress:
+ done = true;
+ break;
+ case kProgress:
+ done = true;
+ progress = true;
+ break;
+ case kComplete:
+ progress = true;
+ clean_up->emplace_back(std::move(cur_attempt->done_callback),
+ cur_attempt->cancellation_token,
+ cur_attempt->context->cancellation_manager());
+ attempts->pop_front();
+ break;
+ }
+ }
+ }
+ return progress;
+}
+
+void FIFOQueue::FlushUnlocked() {
+ std::vector<CleanUp> clean_up;
+ Ref();
+ {
+ mutex_lock lock(mu_);
+ bool changed;
+ do {
+ changed = TryAttemptLocked(kEnqueue, &clean_up);
+ changed = TryAttemptLocked(kDequeue, &clean_up) || changed;
+ } while (changed);
+ }
+ Unref();
+ for (const auto& to_clean : clean_up) {
+ if (to_clean.to_deregister != CancellationManager::kInvalidToken) {
+ // NOTE(mrry): We can safely ignore the return value of
+ // DeregisterCallback because the mutex mu_ ensures that the
+ // cleanup action only executes once.
+ to_clean.cm->DeregisterCallback(to_clean.to_deregister);
+ }
+ to_clean.finished();
+ }
+}
+
+void FIFOQueue::TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ 1, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(
+ errors::Aborted("FIFOQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ if (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ for (int i = 0; i < num_components(); ++i) {
+ queues_[i].push_back(PersistentTensor(tuple[i]));
+ }
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+/* static */
+Status FIFOQueue::GetElementComponentFromBatch(const FIFOQueue::Tuple& tuple,
+ int index, int component,
+ OpKernelContext* ctx,
+ PersistentTensor* out_tensor) {
+ TensorShape element_shape(tuple[component].shape());
+ element_shape.RemoveDim(0);
+ Tensor* element_access = nullptr;
+ TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+ tuple[component].dtype(), element_shape, out_tensor, &element_access));
+ TF_RETURN_IF_ERROR(
+ CopySliceToElement(tuple[component], element_access, index));
+ return Status::OK();
+}
+
+void FIFOQueue::TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) {
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (batch_size == 0) {
+ callback();
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ batch_size, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(
+ errors::Aborted("FIFOQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ RunResult result = kNoProgress;
+ while (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ result = kProgress;
+ const int index =
+ tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ PersistentTensor element;
+ attempt->context->SetStatus(GetElementComponentFromBatch(
+ tuple, index, i, attempt->context, &element));
+ if (!attempt->context->status().ok()) return kComplete;
+ queues_[i].push_back(element);
+ }
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+void FIFOQueue::TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ 1, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ const int32 s = queues_[0].size();
+ if (closed_ && s == 0) {
+ attempt->context->SetStatus(errors::OutOfRange(
+ "FIFOQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ", 1, ", current size ", s,
+ ")"));
+ return kComplete;
+ }
+ if (s > 0) {
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ attempt->done_callback = [callback, tuple]() { callback(tuple); };
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
+
+void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) {
+ if (!specified_shapes()) {
+ ctx->SetStatus(
+ errors::InvalidArgument("FIFOQueue's DequeueMany requires the "
+ "components to have specified shapes."));
+ callback(Tuple());
+ return;
+ }
+ if (num_elements == 0) {
+ Tuple tuple;
+ tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ // TODO(josh11b,misard): Switch to allocate_output(). Problem is
+ // this breaks the abstraction boundary since we don't *really*
+ // know if and how the Tensors in the tuple we pass to callback
+ // correspond to the outputs of *ctx. For example, the
+ // ReaderRead Op uses TryDequeue() to get a filename out of a
+ // queue that is used internally by the reader and is not
+ // associated with any output of the ReaderRead.
+ // mrry@ adds:
+ // Maybe we need to pass a std::function<Tensor*(...)> (or
+ // better signature) that calls the appropriate allocator
+ // function in addition to ctx? (Or support a shim Allocator
+ // that has an internal OpKernelContext*, and dispatches to the
+ // appropriate method?)
+ // misard@ adds:
+ // I don't see that a std::function would help. The problem is
+ // that at this point (allocation time) the system doesn't know
+ // what is going to happen to the element read out of the
+ // queue. As long as we keep the generality that TensorFlow Ops
+ // do their own dynamic allocation in arbitrary C++ code, we
+ // need to preserve robustness to allocating output Tensors with
+ // the 'wrong' attributes, and fixing up with a copy. The only
+ // improvement I can see here in the future would be to support
+ // an optimized case where the queue 'knows' what attributes to
+ // use, and plumbs them through here.
+ Tensor element;
+ ctx->allocate_temp(component_dtypes_[i], ManyOutShape(i, 0), &element);
+ tuple.emplace_back(element);
+ }
+ callback(tuple);
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ num_elements, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ int32 s = queues_[0].size();
+ if (closed_ && s < attempt->elements_requested) {
+ attempt->context->SetStatus(errors::OutOfRange(
+ "FIFOQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ",
+ attempt->elements_requested, ", current size ", s, ")"));
+
+ // TODO(mrry): Add support for producing a partial batch as
+ // output when the queue is closed.
+ if (!attempt->tuple.empty()) {
+ // Restore already-dequeued elements to the front of the queue.
+ for (int64 i = attempt->tuple[0].dim_size(0) -
+ attempt->elements_requested - 1;
+ i >= 0; --i) {
+ for (int j = 0; j < num_components(); ++j) {
+ PersistentTensor element;
+ Status s = GetElementComponentFromBatch(
+ attempt->tuple, i, j, attempt->context, &element);
+ if (!s.ok()) {
+ attempt->context->SetStatus(
+ errors::DataLoss("Failed to restore element from "
+ "partially-dequeued batch "
+ "to FIFOQueue"));
+ }
+ queues_[j].push_front(element);
+ }
+ }
+ }
+ return kComplete;
+ }
+
+ RunResult result = kNoProgress;
+ for (; s > 0; --s) {
+ if (attempt->tuple.empty()) {
+ // Only allocate tuple when we have something to dequeue
+                  // so we don't use excessive memory when there are many
+ // blocked dequeue attempts waiting.
+ attempt->tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ const TensorShape shape =
+ ManyOutShape(i, attempt->elements_requested);
+ Tensor element;
+ attempt->context->allocate_temp(component_dtypes_[i], shape,
+ &element);
+ attempt->tuple.emplace_back(element);
+ }
+ }
+ result = kProgress;
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ const int index =
+ attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ attempt->context->SetStatus(
+ CopyElementToSlice(tuple[i], &attempt->tuple[i], index));
+ if (!attempt->context->status().ok()) return kComplete;
+ }
+ tuple.clear();
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ tuple = attempt->tuple;
+ attempt->done_callback = [callback, tuple]() {
+ callback(tuple);
+ };
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
+
+void FIFOQueue::Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
+ DoneCallback callback) {
+ if (cancel_pending_enqueues) {
+ CloseAndCancel();
+ callback();
+ } else {
+ {
+ mutex_lock lock(mu_);
+ enqueue_attempts_.emplace_back(
+ 0, callback, ctx, CancellationManager::kInvalidToken,
+ [this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "FIFOQueue '", name_, "' is already closed."));
+ } else {
+ closed_ = true;
+ }
+ return kComplete;
+ });
+ }
+ FlushUnlocked();
+ }
+}
+
+Status FIFOQueue::MatchesNodeDef(const NodeDef& node_def) {
+ TF_RETURN_IF_ERROR(MatchesNodeDefOp(node_def, "FIFOQueue"));
+ TF_RETURN_IF_ERROR(MatchesNodeDefCapacity(node_def, capacity_));
+ TF_RETURN_IF_ERROR(MatchesNodeDefTypes(node_def));
+ TF_RETURN_IF_ERROR(MatchesNodeDefShapes(node_def));
+ return Status::OK();
+}
+
+} // namespace tensorflow
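
TryAttemptLocked drains whichever attempt queue it is given until an attempt reports no progress, popping attempts whose callbacks return kComplete; FlushUnlocked then re-runs it until nothing changes. A simplified, self-contained sketch of that loop (cancellation handling omitted, all names here are invented):

    #include <cstdio>
    #include <deque>
    #include <functional>

    enum RunResult { kNoProgress, kProgress, kComplete };

    struct Attempt {
      std::function<RunResult()> run_callback;
    };

    // Mirrors the drain loop: stop on the first blocked attempt, pop completed ones.
    bool TryAttempts(std::deque<Attempt>* attempts) {
      bool progress = false;
      bool done = false;
      while (!done && !attempts->empty()) {
        switch (attempts->front().run_callback()) {
          case kNoProgress:
            done = true;
            break;
          case kProgress:
            done = true;
            progress = true;
            break;
          case kComplete:
            progress = true;
            attempts->pop_front();
            break;
        }
      }
      return progress;
    }

    int main() {
      int available = 2;  // stand-in for queued elements
      std::deque<Attempt> dequeue_attempts;
      for (int i = 0; i < 3; ++i) {
        dequeue_attempts.push_back({[&available]() {
          if (available > 0) {
            --available;
            return kComplete;
          }
          return kNoProgress;
        }});
      }
      while (TryAttempts(&dequeue_attempts)) {
      }
      std::printf("attempts left: %zu\n", dequeue_attempts.size());  // prints 1
      return 0;
    }
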
diff --git a/tensorflow/core/kernels/fifo_queue.h b/tensorflow/core/kernels/fifo_queue.h
new file mode 100644
index 0000000000..e9fe5f34a4
--- /dev/null
+++ b/tensorflow/core/kernels/fifo_queue.h
@@ -0,0 +1,127 @@
+#ifndef TENSORFLOW_KERNELS_FIFO_QUEUE_H_
+#define TENSORFLOW_KERNELS_FIFO_QUEUE_H_
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class FIFOQueue : public QueueBase {
+ public:
+ FIFOQueue(int32 capacity, const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name);
+ Status Initialize(); // Must be called before any other method.
+
+ // Implementations of QueueInterface methods --------------------------------
+
+ Status ValidateTuple(const Tuple& tuple) override;
+ Status ValidateManyTuple(const Tuple& tuple) override;
+ void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override;
+ void TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) override;
+ void Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
+ DoneCallback callback) override;
+ Status MatchesNodeDef(const NodeDef& node_def) override;
+
+ int32 size() override {
+ mutex_lock lock(mu_);
+ return queues_[0].size();
+ }
+
+ int32 capacity() const { return capacity_; }
+
+ private:
+ enum Action { kEnqueue, kDequeue };
+
+ ~FIFOQueue() override {}
+
+ TensorShape ManyOutShape(int i, int64 batch_size) {
+ TensorShape shape({batch_size});
+ shape.AppendShape(component_shapes_[i]);
+ return shape;
+ }
+
+ // Helper for dequeuing a single element from queues_.
+ void DequeueLocked(OpKernelContext* ctx, Tuple* tuple)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ void Cancel(Action action, CancellationToken token);
+
+ // Helper for cancelling all pending Enqueue(Many) operations when
+ // Close is called with cancel_pending_enqueues.
+ void CloseAndCancel();
+
+  // Tries to enqueue/dequeue (or close) based on whatever is at the
+  // front of enqueue_attempts_/dequeue_attempts_. Appends to
+  // *clean_up the callback for any finished attempt (so it may be
+  // called once mu_ is released). Returns true if any progress was
+  // made.
+ struct CleanUp {
+ CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm)
+ : finished(f), to_deregister(ct), cm(cm) {}
+ DoneCallback finished;
+ CancellationToken to_deregister;
+ CancellationManager* cm;
+ };
+ bool TryAttemptLocked(Action action, std::vector<CleanUp>* clean_up)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ // Tries to make progress on the enqueues or dequeues at the front
+ // of the *_attempts_ queues.
+ void FlushUnlocked();
+
+ const int32 capacity_;
+
+ mutex mu_;
+ typedef std::deque<PersistentTensor> SubQueue;
+ std::vector<SubQueue> queues_ GUARDED_BY(mu_);
+ bool closed_ GUARDED_BY(mu_);
+
+ enum RunResult { kNoProgress, kProgress, kComplete };
+ struct Attempt;
+ typedef std::function<RunResult(Attempt*)> RunCallback;
+ struct Attempt {
+ int32 elements_requested;
+ DoneCallback done_callback; // must be run outside mu_
+ OpKernelContext* context;
+ CancellationToken cancellation_token;
+ RunCallback run_callback; // must be run while holding mu_
+ bool is_cancelled;
+ Tuple tuple;
+
+ Attempt(int32 elements_requested, DoneCallback done_callback,
+ OpKernelContext* context, CancellationToken cancellation_token,
+ RunCallback run_callback)
+ : elements_requested(elements_requested),
+ done_callback(done_callback),
+ context(context),
+ cancellation_token(cancellation_token),
+ run_callback(run_callback),
+ is_cancelled(false) {}
+ };
+ std::deque<Attempt> enqueue_attempts_ GUARDED_BY(mu_);
+ std::deque<Attempt> dequeue_attempts_ GUARDED_BY(mu_);
+
+ static Status GetElementComponentFromBatch(const Tuple& tuple, int index,
+ int component,
+ OpKernelContext* ctx,
+ PersistentTensor* out_element);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueue);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_FIFO_QUEUE_H_
diff --git a/tensorflow/core/kernels/fifo_queue_op.cc b/tensorflow/core/kernels/fifo_queue_op.cc
new file mode 100644
index 0000000000..f1088181fe
--- /dev/null
+++ b/tensorflow/core/kernels/fifo_queue_op.cc
@@ -0,0 +1,93 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fifo_queue.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Defines a FIFOQueueOp, which produces a Queue (specifically, one
+// backed by FIFOQueue) that persists across different graph
+// executions and sessions. Running this op produces a single-element
+// tensor of handles to Queues in the corresponding device.
+class FIFOQueueOp : public OpKernel {
+ public:
+ explicit FIFOQueueOp(OpKernelConstruction* context)
+ : OpKernel(context), queue_handle_set_(false) {
+ OP_REQUIRES_OK(context, context->GetAttr("capacity", &capacity_));
+ OP_REQUIRES_OK(context,
+ context->allocate_persistent(DT_STRING, TensorShape({2}),
+ &queue_handle_, nullptr));
+ if (capacity_ < 0) {
+ capacity_ = FIFOQueue::kUnbounded;
+ }
+ OP_REQUIRES_OK(context,
+ context->GetAttr("component_types", &component_types_));
+ OP_REQUIRES_OK(context, context->GetAttr("shapes", &component_shapes_));
+ }
+
+ ~FIFOQueueOp() override {
+ // If the queue object was not shared, delete it.
+ if (queue_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+ TF_CHECK_OK(cinfo_.resource_manager()->Delete<QueueInterface>(
+ cinfo_.container(), cinfo_.name()));
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ if (!queue_handle_set_) {
+ OP_REQUIRES_OK(ctx, SetQueueHandle(ctx));
+ }
+ ctx->set_output_ref(0, &mu_, queue_handle_.AccessTensor(ctx));
+ }
+
+ private:
+ Status SetQueueHandle(OpKernelContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ TF_RETURN_IF_ERROR(cinfo_.Init(ctx->resource_manager(), def()));
+ QueueInterface* queue;
+ auto creator = [this](QueueInterface** ret) {
+ FIFOQueue* queue = new FIFOQueue(capacity_, component_types_,
+ component_shapes_, cinfo_.name());
+ *ret = queue;
+ return queue->Initialize();
+ };
+ TF_RETURN_IF_ERROR(
+ cinfo_.resource_manager()->LookupOrCreate<QueueInterface>(
+ cinfo_.container(), cinfo_.name(), &queue, creator));
+ core::ScopedUnref unref_me(queue);
+ // Verify that the shared queue is compatible with the requested arguments.
+ TF_RETURN_IF_ERROR(queue->MatchesNodeDef(def()));
+ auto h = queue_handle_.AccessTensor(ctx)->flat<string>();
+ h(0) = cinfo_.container();
+ h(1) = cinfo_.name();
+ queue_handle_set_ = true;
+ return Status::OK();
+ }
+
+ int32 capacity_;
+ DataTypeVector component_types_;
+ std::vector<TensorShape> component_shapes_;
+ ContainerInfo cinfo_;
+
+ mutex mu_;
+ PersistentTensor queue_handle_ GUARDED_BY(mu_);
+ bool queue_handle_set_ GUARDED_BY(mu_);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("FIFOQueue").Device(DEVICE_CPU), FIFOQueueOp);
+
+} // namespace tensorflow
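
The op caches a handle and relies on the resource manager's look-up-or-create behaviour, so every op instance naming the same container/name shares one queue. A simplified stand-in for that pattern (not TensorFlow's ResourceMgr API; all names here are illustrative):

    #include <cstdio>
    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    struct Queue { int capacity; };

    class Registry {
     public:
      // The first call for a given name runs the creator; later calls return
      // the already-created object, which is how the queue is shared.
      Queue* LookupOrCreate(const std::string& name,
                            const std::function<Queue*()>& creator) {
        auto it = objects_.find(name);
        if (it == objects_.end()) {
          it = objects_.emplace(name, std::unique_ptr<Queue>(creator())).first;
        }
        return it->second.get();
      }

     private:
      std::map<std::string, std::unique_ptr<Queue>> objects_;
    };

    int main() {
      Registry registry;
      auto creator = []() { return new Queue{10}; };
      Queue* a = registry.LookupOrCreate("fifo_queue", creator);
      Queue* b = registry.LookupOrCreate("fifo_queue", creator);  // shared
      std::printf("same object: %s\n", a == b ? "yes" : "no");    // prints yes
      return 0;
    }
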
diff --git a/tensorflow/core/kernels/fill_functor.h b/tensorflow/core/kernels/fill_functor.h
new file mode 100644
index 0000000000..831f0c899e
--- /dev/null
+++ b/tensorflow/core/kernels/fill_functor.h
@@ -0,0 +1,26 @@
+#ifndef TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
+#define TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct FillFunctor {
+  // Computes on device "d": out = out.constant(in(0)).
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in);
+};
+
+template <typename Device, typename T>
+struct SetZeroFunctor {
+  // Computes on device "d": out = out.setZero().
+ void operator()(const Device& d, typename TTypes<T>::Flat out);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
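
The header only declares the functors; a device-templated definition typically drives a single Eigen expression. A standalone sketch of the intended contract, using Eigen::DefaultDevice in place of the Device template parameter (illustrative only, not the kernel's actual definition):

    #include <iostream>
    #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

    int main() {
      Eigen::Tensor<float, 1> out(8);   // stand-in for TTypes<T>::Flat
      Eigen::Tensor<float, 0> in;       // stand-in for TTypes<T>::ConstScalar
      in() = 3.5f;
      Eigen::DefaultDevice d;
      out.device(d) = out.constant(in());  // broadcast the scalar into out
      std::cout << out(0) << " ... " << out(7) << std::endl;  // 3.5 ... 3.5
      return 0;
    }
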
diff --git a/tensorflow/core/kernels/fixed_length_record_reader_op.cc b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
new file mode 100644
index 0000000000..77516ab151
--- /dev/null
+++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
@@ -0,0 +1,109 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+
+class FixedLengthRecordReader : public ReaderBase {
+ public:
+ FixedLengthRecordReader(const string& node_name, int64 header_bytes,
+ int64 record_bytes, int64 footer_bytes, Env* env)
+ : ReaderBase(
+ strings::StrCat("FixedLengthRecordReader '", node_name, "'")),
+ header_bytes_(header_bytes),
+ record_bytes_(record_bytes),
+ footer_bytes_(footer_bytes),
+ env_(env),
+ file_pos_limit_(-1),
+ record_number_(0) {}
+
+ // On success:
+ // * input_buffer_ != nullptr,
+  //   * input_buffer_->Tell() == header_bytes_
+  //   * file_pos_limit_ == file size - footer_bytes_
+ Status OnWorkStartedLocked() override {
+ record_number_ = 0;
+ uint64 file_size = 0;
+ TF_RETURN_IF_ERROR(env_->GetFileSize(current_work(), &file_size));
+ file_pos_limit_ = file_size - footer_bytes_;
+
+ RandomAccessFile* file = nullptr;
+ TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file));
+ input_buffer_.reset(new io::InputBuffer(file, kBufferSize));
+ TF_RETURN_IF_ERROR(input_buffer_->SkipNBytes(header_bytes_));
+ return Status::OK();
+ }
+
+ Status OnWorkFinishedLocked() override {
+ input_buffer_.reset(nullptr);
+ return Status::OK();
+ }
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ if (input_buffer_->Tell() >= file_pos_limit_) {
+ *at_end = true;
+ return Status::OK();
+ }
+ TF_RETURN_IF_ERROR(input_buffer_->ReadNBytes(record_bytes_, value));
+ *key = strings::StrCat(current_work(), ":", record_number_);
+ *produced = true;
+ ++record_number_;
+ return Status::OK();
+ }
+
+ Status ResetLocked() override {
+ file_pos_limit_ = -1;
+ record_number_ = 0;
+ input_buffer_.reset(nullptr);
+ return ReaderBase::ResetLocked();
+ }
+
+ // TODO(josh11b): Implement serializing and restoring the state.
+
+ private:
+ enum { kBufferSize = 256 << 10 /* 256 kB */ };
+ const int64 header_bytes_;
+ const int64 record_bytes_;
+ const int64 footer_bytes_;
+ Env* const env_;
+ int64 file_pos_limit_;
+ int64 record_number_;
+ std::unique_ptr<io::InputBuffer> input_buffer_;
+};
+
+class FixedLengthRecordReaderOp : public ReaderOpKernel {
+ public:
+ explicit FixedLengthRecordReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ int64 header_bytes = -1, record_bytes = -1, footer_bytes = -1;
+ OP_REQUIRES_OK(context, context->GetAttr("header_bytes", &header_bytes));
+ OP_REQUIRES_OK(context, context->GetAttr("record_bytes", &record_bytes));
+ OP_REQUIRES_OK(context, context->GetAttr("footer_bytes", &footer_bytes));
+ OP_REQUIRES(context, header_bytes >= 0,
+ errors::InvalidArgument("header_bytes must be >= 0 not ",
+ header_bytes));
+ OP_REQUIRES(context, record_bytes >= 0,
+ errors::InvalidArgument("record_bytes must be >= 0 not ",
+ record_bytes));
+ OP_REQUIRES(context, footer_bytes >= 0,
+ errors::InvalidArgument("footer_bytes must be >= 0 not ",
+ footer_bytes));
+ Env* env = context->env();
+ SetReaderFactory([this, header_bytes, record_bytes, footer_bytes, env]() {
+ return new FixedLengthRecordReader(name(), header_bytes, record_bytes,
+ footer_bytes, env);
+ });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordReader").Device(DEVICE_CPU),
+ FixedLengthRecordReaderOp);
+
+} // namespace tensorflow
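
For a well-formed file (file_size == header_bytes + n * record_bytes + footer_bytes), the reader above produces exactly n records, keyed "<filename>:<record_number>". A small sketch of that arithmetic, with invented sizes and filename:

    #include <cstdio>
    #include <string>

    int main() {
      const long long file_size = 1024, header_bytes = 16, footer_bytes = 8,
                      record_bytes = 100;
      const long long num_records =
          (file_size - header_bytes - footer_bytes) / record_bytes;
      std::printf("num_records = %lld\n", num_records);  // prints 10
      for (long long i = 0; i < 2; ++i) {
        const std::string key = "data.bin:" + std::to_string(i);
        std::printf("%s\n", key.c_str());  // data.bin:0, data.bin:1
      }
      return 0;
    }
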
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
new file mode 100644
index 0000000000..8bd48f26d6
--- /dev/null
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -0,0 +1,136 @@
+// See docs in ../ops/array_ops.cc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+template <typename T, typename Index, int static_slice_elems>
+void HandleCopies(const Tensor& Tparams,
+ typename TTypes<Index>::ConstVec& Tindices, int slice_elems,
+ typename TTypes<T>::Matrix Tout) {
+ const int N = Tindices.dimension(0);
+ const auto& Tparams_flat = Tparams.flat_outer_dims<T>();
+ T* Tout_base = &Tout(0, 0);
+ const T* Tparams_base = &Tparams_flat(0, 0);
+ const size_t slice_bytes = slice_elems * sizeof(T);
+ if (static_slice_elems >= 0) {
+ // Give compiler static knowledge of the number of elements/bytes
+ CHECK_EQ(static_slice_elems, slice_elems);
+ slice_elems = static_slice_elems;
+ }
+ for (int i = 0; i < N; i++) {
+ int j = i + 1;
+ if (j < N) {
+ port::prefetch<port::PREFETCH_HINT_T0>(&Tparams_flat(Tindices(j), 0));
+ port::prefetch<port::PREFETCH_HINT_T0>(&Tout(j, 0));
+ }
+ memcpy(Tout_base + i * slice_elems,
+ Tparams_base + Tindices(i) * slice_elems, slice_bytes);
+ }
+}
+
+} // anonymous namespace
+
+template <typename T, typename Index>
+class GatherOp : public OpKernel {
+ public:
+ // QUESTION: It'd be nice to support DT_INT16, DT_UINT8,
+ // etc. here for the type of the second input argument. Should
+ // we have the framework do some sort of integer promotion
+ // automatically, or should that be something that users have to
+ // do explicitly with a conversion operator in the graph?
+ explicit GatherOp(OpKernelConstruction* c) : OpKernel(c) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ const DataType index_t = DataTypeToEnum<Index>::v();
+ OP_REQUIRES_OK(c, c->MatchSignature({dt, index_t}, {dt}));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& Tparams = c->input(0);
+ const Tensor& Tindices = c->input(1);
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsVectorOrHigher(Tparams.shape()),
+ errors::InvalidArgument("params must be at least 1 dimensional"));
+ const int64 N = Tindices.NumElements();
+ const int64 first_dim_size = Tparams.dim_size(0);
+
+ // Validate all the indices are in range
+ auto Tindices_vec = Tindices.flat<Index>();
+ for (int64 i = 0; i < N; i++) {
+ const Index index = Tindices_vec(i);
+ OP_REQUIRES(c, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in Tindices is out of range")));
+ }
+
+ // The result shape is indices.shape + params.shape[1:].
+ TensorShape result_shape = Tindices.shape();
+ for (int i = 1; i < Tparams.dims(); i++) {
+ result_shape.AddDim(Tparams.dim_size(i));
+ }
+
+ Tensor* Tout = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &Tout));
+ const auto& Tparams_flat = Tparams.flat_outer_dims<T>();
+ if (N > 0) {
+ auto Tindices_flat = Tindices.flat<Index>();
+ auto Tout_flat = Tout->shaped<T, 2>({N, Tout->NumElements() / N});
+ if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ const int64 slice_size = Tout->NumElements() / N;
+#define SPECIALIZE(elems) \
+ do { \
+ if (slice_size == elems) { \
+ HandleCopies<T, Index, elems>(Tparams, Tindices_flat, slice_size, \
+ Tout_flat); \
+ return; \
+ } \
+ } while (0)
+
+ SPECIALIZE(10);
+ SPECIALIZE(20);
+
+#undef SPECIALIZE
+
+ HandleCopies<T, Index, -1>(Tparams, Tindices_flat, slice_size,
+ Tout_flat);
+ } else {
+ for (int i = 0; i < N; i++) {
+ int j = i + 1;
+ if (j < N) {
+ port::prefetch<port::PREFETCH_HINT_T0>(
+ &Tparams_flat(Tindices_vec(j), 0));
+ port::prefetch<port::PREFETCH_HINT_T0>(&Tout_flat(j, 0));
+ }
+ // Copy last Ndim-1 dimensions of Tparams[Tindices[i]] to Tout[i]
+ Tout_flat.template chip<0>(i) =
+ Tparams_flat.template chip<0>(Tindices_vec(i));
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_GATHER(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("Gather") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("Tparams") \
+ .TypeConstraint<index_type>("Tindices"), \
+ GatherOp<type, index_type>)
+
+#define REGISTER_GATHER_INT32(type) REGISTER_GATHER(type, int32)
+#define REGISTER_GATHER_INT64(type) REGISTER_GATHER(type, int64)
+
+TF_CALL_ALL_TYPES(REGISTER_GATHER_INT32);
+TF_CALL_ALL_TYPES(REGISTER_GATHER_INT64);
+
+#undef REGISTER_GATHER_INT32
+#undef REGISTER_GATHER_INT64
+#undef REGISTER_GATHER
+
+} // namespace tensorflow
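The memcpy path above relies on a simple layout: params is viewed as a [first_dim, slice_elems] matrix and one contiguous slice is copied per index, so the result shape is indices.shape + params.shape[1:]. A minimal standalone sketch of that copy loop, in plain C++ and independent of the Tensor API (all names here are illustrative):

#include <cstring>
#include <iostream>
#include <vector>

// Gathers rows of a flattened [num_rows, slice_elems] table into out,
// one contiguous memcpy per index -- the same layout the kernel relies on.
void GatherRows(const std::vector<float>& params, int slice_elems,
                const std::vector<int>& indices, std::vector<float>* out) {
  out->resize(indices.size() * slice_elems);
  for (size_t i = 0; i < indices.size(); ++i) {
    std::memcpy(out->data() + i * slice_elems,
                params.data() + indices[i] * slice_elems,
                slice_elems * sizeof(float));
  }
}

int main() {
  // params has shape [5, 3]; gathering indices {0, 4, 0, 2} yields shape [4, 3].
  std::vector<float> params = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
  std::vector<int> indices = {0, 4, 0, 2};
  std::vector<float> out;
  GatherRows(params, /*slice_elems=*/3, indices, &out);
  for (float v : out) std::cout << v << " ";  // 0 1 2 12 13 14 0 1 2 6 7 8
  std::cout << "\n";
}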
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
new file mode 100644
index 0000000000..d7410169e1
--- /dev/null
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -0,0 +1,213 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace {
+
+class GatherOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType index_type) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Gather")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(index_type))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(GatherOpTest, ScalarIndices) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({}), {3});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({}));
+ test::FillValues<float>(&expected, {3});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, Simple_TwoD32) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({4}), {0, 4, 0, 2});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 3}));
+ test::FillValues<float>(&expected, {0, 1, 2, 12, 13, 14, 0, 1, 2, 6, 7, 8});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, Simple_TwoD64) {
+ MakeOp(DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int64>(TensorShape({4}), {0, 4, 0, 2});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 3}));
+ test::FillValues<float>(&expected, {0, 1, 2, 12, 13, 14, 0, 1, 2, 6, 7, 8});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, HighRank) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({4}), {0, 1, 2, 3});
+ AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 0, 2, 3, 0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+ test::FillValues<float>(&expected, {1, 2, 0, 2, 3, 0});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(GatherOpTest, Error_IndexOutOfRange) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({4}), {0, 4, 99, 2});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Index 99 at offset 2 in Tindices is out of range"))
+ << s;
+}
+
+class GatherOpForBenchmark : public GatherOpTest {
+ public:
+  // TestBody() is unused; the benchmark drives the op through PublicMakeOp().
+  void TestBody() override {}

+ void PublicMakeOp(DataType index_type) { MakeOp(index_type); }
+};
+
+static const int kSorted = 0x8000; // Mask for arg to specify sorting vs. not
+
+template <typename Index>
+void BM_Gather(int iters, int arg) {
+ testing::StopTiming();
+
+ bool sorted = ((arg & kSorted) != 0);
+ int dim = arg & ~kSorted;
+
+ GatherOpForBenchmark t;
+ t.PublicMakeOp(DataTypeToEnum<Index>::v());
+ // Use a 512 MB table, regardless of dim
+ const int kRows = ((1 << 29) / sizeof(float)) / dim;
+ std::vector<float> data(kRows * dim, 1.0f);
+ t.AddInputFromArray<float>(TensorShape({kRows, dim}), data);
+ const int kLookups = 2000;
+ const int kBatches = 1000000 / kLookups;
+ random::PhiloxRandom philox(301, 17);
+ random::SimplePhilox rnd(&philox);
+ std::vector<std::vector<Index>> all_ids(kBatches);
+ for (int i = 0; i < kBatches; ++i) {
+ std::vector<Index>* ids = &all_ids[i];
+ ids->resize(kLookups);
+ for (int j = 0; j < kLookups; ++j) {
+ (*ids)[j] = rnd.Uniform(kRows);
+ }
+ if (sorted) {
+ sort(ids->begin(), ids->end());
+ }
+ }
+
+ t.AddInput<Index>(TensorShape({kLookups}), [](int i) { return 0; });
+ if (sorted) {
+ testing::SetLabel("sorted by id");
+ }
+ testing::BytesProcessed(static_cast<int64>(iters) * kLookups * dim *
+ sizeof(float));
+ testing::StartTiming();
+ while (--iters > 0) {
+ const std::vector<Index>& b = all_ids[iters % kBatches];
+ TensorValue input = t.mutable_input(1);
+ gtl::MutableArraySlice<Index> slice(&input->vec<Index>()(0),
+ input->NumElements());
+ for (int i = 0; i < kLookups; i++) {
+ slice[i] = b[i];
+ }
+ Status s = t.RunOpKernel();
+ }
+}
+
+static void BM_Gather32(int iters, int arg) { BM_Gather<int32>(iters, arg); }
+
+static void BM_Gather64(int iters, int arg) { BM_Gather<int64>(iters, arg); }
+
+BENCHMARK(BM_Gather32)
+ ->Arg(10)
+ ->Arg(10 | kSorted)
+ ->Arg(20)
+ ->Arg(40)
+ ->Arg(63)
+ ->Arg(63 | kSorted)
+ ->Arg(64)
+ ->Arg(64 | kSorted)
+ ->Arg(65)
+ ->Arg(65 | kSorted)
+ ->Arg(100)
+ ->Arg(100 | kSorted)
+ ->Arg(127)
+ ->Arg(127 | kSorted)
+ ->Arg(128)
+ ->Arg(128 | kSorted)
+ ->Arg(129)
+ ->Arg(129 | kSorted)
+ ->Arg(1000)
+ ->Arg(1000 | kSorted);
+
+BENCHMARK(BM_Gather64)
+ ->Arg(10)
+ ->Arg(10 | kSorted)
+ ->Arg(20)
+ ->Arg(40)
+ ->Arg(63)
+ ->Arg(63 | kSorted)
+ ->Arg(64)
+ ->Arg(64 | kSorted)
+ ->Arg(65)
+ ->Arg(65 | kSorted)
+ ->Arg(100)
+ ->Arg(100 | kSorted)
+ ->Arg(127)
+ ->Arg(127 | kSorted)
+ ->Arg(128)
+ ->Arg(128 | kSorted)
+ ->Arg(129)
+ ->Arg(129 | kSorted)
+ ->Arg(1000)
+ ->Arg(1000 | kSorted);
+
+} // namespace
+} // namespace tensorflow
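The benchmark packs two things into a single argument: the low bits carry the slice dimension and bit 0x8000 (kSorted) flags whether the lookup ids are pre-sorted. A small sketch of that packing, using only the constant defined in the test above (everything else is illustrative):

#include <cstdio>

// Mirrors how the benchmark encodes its argument: low bits are the slice
// dimension, bit 0x8000 marks "ids sorted". Constant name matches the test.
constexpr int kSorted = 0x8000;

int main() {
  int arg = 128 | kSorted;             // "dim 128, sorted ids"
  bool sorted = (arg & kSorted) != 0;
  int dim = arg & ~kSorted;
  std::printf("dim=%d sorted=%d\n", dim, sorted);  // dim=128 sorted=1
}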
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
new file mode 100644
index 0000000000..b29efbddfb
--- /dev/null
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -0,0 +1,45 @@
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/identity_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_CPU), IdentityOp);
+// StopGradient does the same thing as Identity, but has a different
+// gradient registered.
+REGISTER_KERNEL_BUILDER(Name("StopGradient").Device(DEVICE_CPU), IdentityOp);
+
+REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Identity").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ IdentityOp); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RefIdentity").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ IdentityOp); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("StopGradient").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ IdentityOp)
+
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+REGISTER_GPU_KERNEL(bool);
+REGISTER_GPU_KERNEL(bfloat16);
+
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Identity")
+ .Device(DEVICE_GPU)
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ IdentityOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/identity_op.h b/tensorflow/core/kernels/identity_op.h
new file mode 100644
index 0000000000..7adc1eace0
--- /dev/null
+++ b/tensorflow/core/kernels/identity_op.h
@@ -0,0 +1,25 @@
+#ifndef TENSORFLOW_KERNELS_IDENTITY_OP_H_
+#define TENSORFLOW_KERNELS_IDENTITY_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class IdentityOp : public OpKernel {
+ public:
+ explicit IdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ if (IsRefType(context->input_dtype(0))) {
+ context->forward_ref_input_to_ref_output(0, 0);
+ } else {
+ context->set_output(0, context->input(0));
+ }
+ }
+
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_IDENTITY_OP_H_
diff --git a/tensorflow/core/kernels/identity_op_test.cc b/tensorflow/core/kernels/identity_op_test.cc
new file mode 100644
index 0000000000..6483367a79
--- /dev/null
+++ b/tensorflow/core/kernels/identity_op_test.cc
@@ -0,0 +1,56 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class IdentityOpTest : public OpsTestBase {
+ protected:
+ Status Init(DataType input_type) {
+ RequireDefaultOps();
+ TF_CHECK_OK(NodeDefBuilder("op", "Identity")
+ .Input(FakeInput(input_type))
+ .Finalize(node_def()));
+ return InitOp();
+ }
+};
+
+TEST_F(IdentityOpTest, Int32Success_6) {
+ ASSERT_OK(Init(DT_INT32));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(IdentityOpTest, Int32Success_2_3) {
+ ASSERT_OK(Init(DT_INT32));
+ AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({2, 3}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(IdentityOpTest, StringSuccess) {
+ ASSERT_OK(Init(DT_STRING));
+ AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_STRING, TensorShape({6}));
+ test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(IdentityOpTest, RefInputError) { ASSERT_OK(Init(DT_INT32_REF)); }
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/identity_reader_op.cc b/tensorflow/core/kernels/identity_reader_op.cc
new file mode 100644
index 0000000000..a63fea5dbb
--- /dev/null
+++ b/tensorflow/core/kernels/identity_reader_op.cc
@@ -0,0 +1,57 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+class IdentityReader : public ReaderBase {
+ public:
+ explicit IdentityReader(const string& node_name)
+ : ReaderBase(strings::StrCat("IdentityReader '", node_name, "'")) {}
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ *key = current_work();
+ *value = current_work();
+ *produced = true;
+ *at_end = true;
+ return Status::OK();
+ }
+
+ // Stores state in a ReaderBaseState proto, since IdentityReader has
+ // no additional state beyond ReaderBase.
+ Status SerializeStateLocked(string* state) override {
+ ReaderBaseState base_state;
+ SaveBaseState(&base_state);
+ base_state.SerializeToString(state);
+ return Status::OK();
+ }
+
+ Status RestoreStateLocked(const string& state) override {
+ ReaderBaseState base_state;
+ if (!ParseProtoUnlimited(&base_state, state)) {
+ return errors::InvalidArgument("Could not parse state for ", name(), ": ",
+ str_util::CEscape(state));
+ }
+ TF_RETURN_IF_ERROR(RestoreBaseState(base_state));
+ return Status::OK();
+ }
+};
+
+class IdentityReaderOp : public ReaderOpKernel {
+ public:
+ explicit IdentityReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ SetReaderFactory([this]() { return new IdentityReader(name()); });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("IdentityReader").Device(DEVICE_CPU),
+ IdentityReaderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
new file mode 100644
index 0000000000..d08f6f53da
--- /dev/null
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -0,0 +1,58 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+template <typename T>
+class InTopK : public OpKernel {
+ public:
+ explicit InTopK(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("k", &k_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const auto& predictions_in = context->input(0);
+ const auto& targets_in = context->input(1);
+ OP_REQUIRES(context, predictions_in.dims() == 2,
+ errors::InvalidArgument("predictions must be 2-dimensional"));
+ OP_REQUIRES(context, targets_in.dims() == 1,
+ errors::InvalidArgument("targets must be 1-dimensional"));
+ OP_REQUIRES(context, predictions_in.dim_size(0) == targets_in.dim_size(0),
+ errors::InvalidArgument("First dimension of predictions ",
+ predictions_in.dim_size(0),
+ " must match length of targets ",
+ targets_in.dim_size(0)));
+ const auto& predictions = predictions_in.matrix<T>();
+ const auto& targets = targets_in.vec<int>();
+
+ Tensor* t_out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({targets_in.dim_size(0)}), &t_out));
+ auto out = t_out->vec<bool>();
+
+ const auto size = targets.size();
+ const auto num_classes = predictions.dimension(1);
+ for (int b = 0; b < size; b++) {
+ T target_prediction = predictions(b, targets(b));
+ int more_probable_classes = 0;
+ for (int i = 0; i < num_classes; ++i) {
+ if (predictions(b, i) > target_prediction) ++more_probable_classes;
+ }
+ out(b) = more_probable_classes < k_;
+ }
+ }
+
+ private:
+ int k_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InTopK").Device(DEVICE_CPU), InTopK<float>);
+
+} // namespace tensorflow
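InTopK marks row b as a hit when strictly fewer than k classes score higher than the target class, so ties count in the target's favor. A standalone sketch of that counting rule in plain C++ (names are illustrative):

#include <iostream>
#include <vector>

// Returns true when fewer than k classes have a strictly larger score than
// the target class -- the same per-row counting rule the kernel applies.
bool InTopK(const std::vector<float>& predictions, int target, int k) {
  int more_probable = 0;
  for (float p : predictions) {
    if (p > predictions[target]) ++more_probable;
  }
  return more_probable < k;
}

int main() {
  std::vector<float> row = {0.1f, 0.3f, 0.2f, 0.4f};
  std::cout << InTopK(row, /*target=*/2, /*k=*/1) << "\n";  // 0: two classes beat it
  std::cout << InTopK(row, /*target=*/2, /*k=*/3) << "\n";  // 1
}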
diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc
new file mode 100644
index 0000000000..7f8b070556
--- /dev/null
+++ b/tensorflow/core/kernels/initializable_lookup_table.cc
@@ -0,0 +1,41 @@
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace lookup {
+
+Status InitializableLookupTable::Find(const Tensor& keys, Tensor* values,
+ const Tensor& default_value) {
+ if (!is_initialized()) {
+ return errors::FailedPrecondition("Table not initialized.");
+ }
+ TF_RETURN_IF_ERROR(CheckFindArguments(keys, *values, default_value));
+ return DoFind(keys, values, default_value);
+}
+
+Status InitializableLookupTable::Initialize(InitTableIterator& iter) {
+ if (!iter.Valid()) {
+ return iter.status();
+ }
+ TF_RETURN_IF_ERROR(CheckKeyAndValueTensors(iter.keys(), iter.values()));
+
+ mutex_lock l(mu_);
+ if (is_initialized()) {
+ return errors::FailedPrecondition("Table already initialized.");
+ }
+
+ TF_RETURN_IF_ERROR(DoPrepare(iter.total_size()));
+ while (iter.Valid()) {
+ TF_RETURN_IF_ERROR(DoInsert(iter.keys(), iter.values()));
+ iter.Next();
+ }
+ if (!errors::IsOutOfRange(iter.status())) {
+ return iter.status();
+ }
+ is_initialized_ = true;
+ return Status::OK();
+}
+
+} // namespace lookup
+} // namespace tensorflow
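Initialize drains the iterator batch by batch and treats an OutOfRange status as the normal end-of-data signal; any other non-OK status aborts the load. A framework-free sketch of that convention (the toy types below are illustrative, not part of this change):

#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for the iterator contract: Valid()/Next()/status(), where an
// "out of range" status signals normal exhaustion rather than a failure.
struct Batch { std::vector<int> keys, values; };

struct ToyIterator {
  std::vector<Batch> batches;
  size_t pos = 0;
  bool Valid() const { return pos < batches.size(); }
  void Next() { ++pos; }
  const Batch& Current() const { return batches[pos]; }
  std::string status() const { return Valid() ? "ok" : "out_of_range"; }
};

int main() {
  ToyIterator it;
  it.batches = {{{1, 2}, {10, 20}}, {{3}, {30}}};
  int inserted = 0;
  while (it.Valid()) {  // drain the iterator batch by batch
    inserted += static_cast<int>(it.Current().keys.size());
    it.Next();
  }
  // Anything other than the end-of-data status would abort the load.
  bool ok = (it.status() == "out_of_range");
  std::cout << "ok=" << ok << " inserted=" << inserted << "\n";  // ok=1 inserted=3
}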
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
new file mode 100644
index 0000000000..651b491457
--- /dev/null
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -0,0 +1,103 @@
+#ifndef TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
+#define TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
+
+#include "tensorflow/core/framework/lookup_interface.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Base class for lookup tables that require initialization.
+class InitializableLookupTable : public LookupInterface {
+ public:
+ class InitTableIterator;
+
+  // Performs batch lookups: for every element in the key tensor, Find returns
+  // the corresponding value in the values tensor.
+ // If an element is not present in the table, the given default value is used.
+ //
+ // For tables that require initialization, `Find` is available once the table
+ // is marked as initialized.
+ //
+ // Returns the following statuses:
+ // - OK: when the find finishes successfully.
+ // - FailedPrecondition: if the table is not initialized.
+ // - InvalidArgument: if any of the preconditions on the lookup key or value
+ // fails.
+ // - In addition, other implementations may provide another non-OK status
+ // specific to their failure modes.
+ Status Find(const Tensor& keys, Tensor* values,
+ const Tensor& default_value) final;
+
+ // Returns whether the table was initialized and is ready to serve lookups.
+ bool is_initialized() const { return is_initialized_; }
+
+ // Initializes the table from the given init table iterator.
+ //
+ // Atomically, this operation prepares the table, populates it with the given
+  // iterator, and marks the table as initialized.
+ //
+ // Returns the following statuses:
+ // - OK: when the initialization was successful.
+ // - InvalidArgument: if any of the preconditions on the lookup key or value
+ // fails.
+  //  - FailedPrecondition: if the table is already initialized when
+  //    Initialize is called.
+ // - In addition, other implementations may provide another non-OK status
+ // specific to their failure modes.
+ Status Initialize(InitTableIterator& iter);
+
+ // Basic iterator to initialize lookup tables.
+ // It yields a sequence of pairs of `keys()` and `values()` Tensors, so that
+ // the consumer may insert key-value pairs in batches.
+ //
+  // When the iterator is exhausted, Valid() returns false and status() returns
+  // an OutOfRange error.
+ class InitTableIterator {
+ public:
+ InitTableIterator() {}
+
+ virtual ~InitTableIterator() {}
+
+ // Prepares the next batch of key and value tensors.
+ virtual void Next() = 0;
+
+ // Returns true if keys and values point to valid tensors.
+ virtual bool Valid() const = 0;
+
+ // Returns a tensor that contains the current batch of 'key' values.
+ virtual const Tensor& keys() const = 0;
+
+ // Returns a tensor that contains the current batch of 'value' values.
+ virtual const Tensor& values() const = 0;
+
+    // Returns an error if one has occurred, otherwise returns Status::OK.
+ virtual Status status() const = 0;
+
+ // Returns the total number of elements that the iterator will produce.
+ virtual int64 total_size() const = 0;
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(InitTableIterator);
+ };
+
+ protected:
+ // Prepares and allocates the underlying data structure to store the given
+ // number of expected elements.
+ virtual Status DoPrepare(size_t expected_num_elements) = 0;
+
+ // Populates the table in batches given keys and values as tensors into the
+ // underlying data structure.
+ virtual Status DoInsert(const Tensor& keys, const Tensor& values) = 0;
+
+ // Performs the batch find operation on the underlying data structure.
+ virtual Status DoFind(const Tensor& keys, Tensor* values,
+ const Tensor& default_value) = 0;
+
+ mutex mu_;
+ bool is_initialized_ = false;
+};
+
+} // namespace lookup
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
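A concrete table only supplies the three hooks: DoPrepare reserves capacity, DoInsert loads a batch, and DoFind resolves keys with a default for misses; locking and initialization ordering stay in the base class. A framework-free analogue of that split (types and names below are illustrative only):

#include <iostream>
#include <unordered_map>
#include <vector>

// Framework-free analogue of the DoPrepare/DoInsert/DoFind split above:
// prepare reserves capacity, insert loads a batch, find falls back to a
// default value for missing keys.
class ToyTable {
 public:
  void DoPrepare(size_t expected) { map_.reserve(expected); }

  void DoInsert(const std::vector<int>& keys, const std::vector<float>& vals) {
    for (size_t i = 0; i < keys.size(); ++i) map_[keys[i]] = vals[i];
  }

  void DoFind(const std::vector<int>& keys, std::vector<float>* out,
              float default_value) const {
    out->clear();
    for (int k : keys) {
      auto it = map_.find(k);
      out->push_back(it == map_.end() ? default_value : it->second);
    }
  }

 private:
  std::unordered_map<int, float> map_;
};

int main() {
  ToyTable table;
  table.DoPrepare(2);
  table.DoInsert({1, 2}, {10.f, 20.f});
  std::vector<float> out;
  table.DoFind({2, 7}, &out, /*default_value=*/-1.f);
  std::cout << out[0] << " " << out[1] << "\n";  // 20 -1
}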
diff --git a/tensorflow/core/kernels/io.cc b/tensorflow/core/kernels/io.cc
new file mode 100644
index 0000000000..9d6921aa8e
--- /dev/null
+++ b/tensorflow/core/kernels/io.cc
@@ -0,0 +1,270 @@
+// See docs in ../ops/io_ops.cc
+#include <unordered_map>
+
+#include "tensorflow/core/kernels/io.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/util/tensor_slice_reader.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#include "tensorflow/core/util/tensor_slice_writer.h"
+
+namespace tensorflow {
+
+namespace {
+bool ParseShapeAndSlice(const string& shape_and_slice, TensorShape* shape,
+ TensorSlice* slice, TensorShape* shape_slice,
+ string* error) {
+ CHECK(!shape_and_slice.empty());
+ // Syntax: dim0 dim1 dim2 ... <slice string>
+ // Where slice string is defined in core/framework/tensor_slice.h
+ std::vector<string> splits = str_util::Split(shape_and_slice, ' ');
+
+ // Must have at least 2 strings.
+ if (splits.size() < 2) {
+ *error = strings::StrCat(
+ "Need least two elements in shape_and_slice specification: ",
+ shape_and_slice);
+ return false;
+ }
+ int num_dims = splits.size() - 1;
+ shape->Clear();
+ for (int i = 0; i < num_dims; ++i) {
+ int dim;
+ if (!str_util::NumericParse32(splits[i], &dim)) {
+ *error = strings::StrCat("Non numerical dimension in shape_and_slice: ",
+ shape_and_slice);
+ return false;
+ }
+ shape->AddDim(dim);
+ }
+ // The last split is the slice specification.
+ slice->Clear();
+ auto status = slice->Parse(splits.back(), slice);
+ if (!status.ok()) {
+ *error = status.error_message();
+ return false;
+ }
+ // The specified slice must be compatible with the specified shape.
+ status = slice->SliceTensorShape(*shape, shape_slice);
+ if (!status.ok()) {
+ *error = status.error_message();
+ return false;
+ }
+ return true;
+}
+} // namespace
+
+void SaveTensors(
+ OpKernelContext* context,
+ checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func,
+ bool save_slices) {
+ const Tensor& filename_t = context->input(0);
+ {
+ const int64 size = filename_t.NumElements();
+ OP_REQUIRES(
+ context, size == 1,
+ errors::InvalidArgument(
+ "Input 0 (filename) must be a string scalar; got a tensor of ",
+ size, "elements"));
+ }
+
+ const Tensor& tensor_names_t = context->input(1);
+ const int64 N = tensor_names_t.NumElements();
+ const string* tensor_shapes_and_slices_ptr = nullptr;
+ if (save_slices) {
+ const Tensor& tensor_shapes_and_slices_t = context->input(2);
+ OP_REQUIRES(
+ context, tensor_shapes_and_slices_t.NumElements() == N,
+ errors::InvalidArgument("Expected ", N,
+ " elements for the tensor "
+ "shapes and slices but got ",
+ tensor_shapes_and_slices_t.NumElements()));
+ tensor_shapes_and_slices_ptr =
+ tensor_shapes_and_slices_t.flat<string>().data();
+ }
+ // Path, names, and slices if save_slices is true.
+ const int kFixedInputs = save_slices ? 3 : 2;
+ OP_REQUIRES(context, context->num_inputs() == N + kFixedInputs,
+ errors::InvalidArgument("Expected totally ", N + kFixedInputs,
+ " inputs as input #1 (which is a string "
+ "tensor of saved names) contains ",
+ N, " names, but received ",
+ context->num_inputs(), " inputs"));
+
+ VLOG(1) << "About to save tensors to file " << filename_t.flat<string>()(0)
+ << "...";
+ checkpoint::TensorSliceWriter writer(filename_t.flat<string>()(0),
+ builder_func);
+
+ Status s;
+ auto tensor_names_flat = tensor_names_t.flat<string>();
+
+ string error;
+ for (int64 i = 0; i < N; ++i) {
+ const string& name = tensor_names_flat(i);
+ const Tensor& input = context->input(i + kFixedInputs);
+ TensorShape shape(input.shape());
+ TensorSlice slice(input.dims());
+ if (save_slices && !tensor_shapes_and_slices_ptr[i].empty()) {
+ const string& shape_spec = tensor_shapes_and_slices_ptr[i];
+ TensorShape slice_shape;
+ OP_REQUIRES(context, ParseShapeAndSlice(shape_spec, &shape, &slice,
+ &slice_shape, &error),
+ errors::InvalidArgument(error));
+ OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()),
+ errors::InvalidArgument("Slice in shape_and_slice "
+ "specification does not match the "
+ "shape of the tensor to save: ",
+ shape_spec, ", tensor: ",
+ input.shape().DebugString()));
+ }
+
+#define WRITER_ADD(dt) \
+ case dt: \
+ s = writer.Add(name, shape, slice, \
+ input.flat<EnumToDataType<dt>::Type>().data()); \
+ break
+
+ switch (input.dtype()) {
+ WRITER_ADD(DT_FLOAT);
+ WRITER_ADD(DT_DOUBLE);
+ WRITER_ADD(DT_INT32);
+ WRITER_ADD(DT_UINT8);
+ WRITER_ADD(DT_INT16);
+ WRITER_ADD(DT_INT8);
+ WRITER_ADD(DT_INT64);
+ WRITER_ADD(DT_QUINT8);
+ WRITER_ADD(DT_QINT8);
+ WRITER_ADD(DT_QINT32);
+ default:
+ context->SetStatus(errors::Unimplemented("Saving data type ",
+ DataTypeString(input.dtype()),
+ " not yet supported"));
+ return;
+ }
+#undef WRITER_ADD
+ if (!s.ok()) {
+ context->SetStatus(s);
+ return;
+ }
+ }
+
+ s = writer.Finish();
+ if (!s.ok()) {
+ context->SetStatus(s);
+ }
+}
+
+void RestoreTensor(OpKernelContext* context,
+ checkpoint::TensorSliceReader::OpenTableFunction open_func,
+ int preferred_shard, bool restore_slice) {
+ const Tensor& file_pattern_t = context->input(0);
+ {
+ const int64 size = file_pattern_t.NumElements();
+ OP_REQUIRES(
+ context, size == 1,
+ errors::InvalidArgument(
+ "Input 0 (file_pattern) must be a string scalar; got a tensor of ",
+ size, "elements"));
+ }
+ const string& file_pattern = file_pattern_t.flat<string>()(0);
+
+ const Tensor& tensor_name_t = context->input(1);
+ {
+ const int64 size = tensor_name_t.NumElements();
+ OP_REQUIRES(
+ context, size == 1,
+ errors::InvalidArgument(
+ "Input 1 (tensor_name) must be a string scalar; got a tensor of ",
+ size, "elements"));
+ }
+ const string& tensor_name = tensor_name_t.flat<string>()(0);
+
+ const string* tensor_shape_and_slice_ptr = nullptr;
+ if (restore_slice) {
+ const Tensor& tensor_shape_and_slice_t = context->input(2);
+ OP_REQUIRES(
+ context, tensor_shape_and_slice_t.NumElements() == 1,
+ errors::InvalidArgument("Expected 1 element for the tensor "
+ "shape and slice but got ",
+ tensor_shape_and_slice_t.NumElements()));
+ tensor_shape_and_slice_ptr = tensor_shape_and_slice_t.flat<string>().data();
+ }
+
+ // If we cannot find a cached reader we will allocate our own.
+ std::unique_ptr<checkpoint::TensorSliceReader> allocated_reader;
+
+ const checkpoint::TensorSliceReader* reader =
+ context->slice_reader_cache()->GetReader(file_pattern, open_func,
+ preferred_shard);
+ if (!reader) {
+ allocated_reader.reset(new checkpoint::TensorSliceReader(
+ file_pattern, open_func, preferred_shard));
+ reader = allocated_reader.get();
+ }
+ OP_REQUIRES_OK(context, CHECK_NOTNULL(reader)->status());
+
+ // Get the shape and type from the save file.
+ DataType type;
+ TensorShape saved_shape;
+ OP_REQUIRES(
+ context, reader->HasTensor(tensor_name, &saved_shape, &type),
+ errors::NotFound("Tensor name \"", tensor_name,
+ "\" not found in checkpoint files ", file_pattern));
+ OP_REQUIRES(
+ context, type == context->expected_output_dtype(0),
+ errors::InvalidArgument("Expected to restore a tensor of type ",
+ DataTypeString(context->expected_output_dtype(0)),
+ ", got a tensor of type ", DataTypeString(type),
+ " instead: tensor_name = ", tensor_name));
+
+ // Shape of the output and slice to load.
+ TensorShape output_shape(saved_shape);
+ TensorSlice slice_to_load(saved_shape.dims());
+ if (restore_slice && !tensor_shape_and_slice_ptr[0].empty()) {
+ const string& shape_spec = tensor_shape_and_slice_ptr[0];
+ TensorShape parsed_shape;
+ string error;
+ OP_REQUIRES(context,
+ ParseShapeAndSlice(shape_spec, &parsed_shape, &slice_to_load,
+ &output_shape, &error),
+ errors::InvalidArgument(error));
+ OP_REQUIRES(
+ context, parsed_shape.IsSameSize(saved_shape),
+ errors::InvalidArgument(
+ "Shape in shape_and_slice spec does not match the shape in the "
+ "save file: ",
+ parsed_shape.DebugString(), ", save file shape: ",
+ saved_shape.DebugString()));
+ }
+
+ Tensor* t = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &t));
+#define READER_COPY(dt) \
+ case dt: \
+ reader->CopySliceData(tensor_name, slice_to_load, \
+ t->flat<EnumToDataType<dt>::Type>().data()); \
+ break
+
+ switch (type) {
+ READER_COPY(DT_FLOAT);
+ READER_COPY(DT_DOUBLE);
+ READER_COPY(DT_INT32);
+ READER_COPY(DT_UINT8);
+ READER_COPY(DT_INT16);
+ READER_COPY(DT_INT8);
+ READER_COPY(DT_INT64);
+ default:
+ context->SetStatus(errors::Unimplemented(
+ "Restoring data type ", DataTypeString(type), " not yet supported"));
+ }
+}
+
+} // namespace tensorflow
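A shape_and_slice spec is a space-separated dimension list followed by a slice string whose grammar is defined in core/framework/tensor_slice.h; for example, "4 5 0,2:-" should describe a [4, 5] tensor of which rows 0..1 and all columns are saved. A standalone sketch of the split step only (the kernel itself uses str_util::Split and TensorSlice::Parse):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Splits a "dim0 dim1 ... <slice>" spec into its dimension list and trailing
// slice string. Illustrative only; the slice grammar itself is handled by
// TensorSlice::Parse in the kernel above.
int main() {
  const std::string spec = "4 5 0,2:-";  // shape [4,5]; save rows 0..1, all cols
  std::istringstream in(spec);
  std::vector<std::string> parts;
  for (std::string tok; in >> tok;) parts.push_back(tok);

  std::vector<int> shape;
  for (size_t i = 0; i + 1 < parts.size(); ++i) shape.push_back(std::stoi(parts[i]));
  const std::string slice = parts.back();

  std::cout << "shape = [" << shape[0] << ", " << shape[1] << "], slice = \""
            << slice << "\"\n";  // shape = [4, 5], slice = "0,2:-"
}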
diff --git a/tensorflow/core/kernels/io.h b/tensorflow/core/kernels/io.h
new file mode 100644
index 0000000000..7e548f1ad0
--- /dev/null
+++ b/tensorflow/core/kernels/io.h
@@ -0,0 +1,38 @@
+#ifndef TENSORFLOW_KERNELS_IO_H_
+#define TENSORFLOW_KERNELS_IO_H_
+
+#include "tensorflow/core/util/tensor_slice_reader.h"
+#include "tensorflow/core/util/tensor_slice_writer.h"
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+// Save input tensors in *context to a writer built from builder_func().
+// context must have the following inputs:
+// 0: a single element string tensor that contains the file name.
+// 1: names for the remaining tensors
+// If save_slices is true:
+// 2: shape and slice specifications.
+// rest: tensors to save
+void SaveTensors(
+ OpKernelContext* context,
+ checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func,
+ bool save_slices);
+
+// Reads a tensor from the reader built from open_func() and produces it as
+// context->output(0). "preferred_shard" is the same as the TensorSliceReader
+// preferred_shard parameter.
+//
+// context must have the following inputs:
+// 0: a single element string tensor that contains the file name.
+// 1: a single element string tensor that names the output to be restored.
+// If restore_slice is true:
+// 2: shape and slice specification of the tensor to restore.
+void RestoreTensor(OpKernelContext* context,
+ checkpoint::TensorSliceReader::OpenTableFunction open_func,
+ int preferred_shard, bool restore_slice);
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_IO_H_
diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc
new file mode 100644
index 0000000000..6f83f01676
--- /dev/null
+++ b/tensorflow/core/kernels/l2loss_op.cc
@@ -0,0 +1,69 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/l2loss_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class L2LossOp : public OpKernel {
+ public:
+ explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ // The input tensor can be of any number of dimensions, even though it's
+ // 2D in most typical applications.
+ const Tensor& input = context->input(0);
+ // The output is a single number.
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+ functor::L2Loss<Device, T>()(context->eigen_device<Device>(),
+ input.flat<T>(), output->scalar<T>());
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("L2Loss").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ L2LossOp<CPUDevice, T>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void L2Loss<GPUDevice, T>::operator()(const GPUDevice& d, \
+ typename TTypes<T>::ConstTensor input, \
+ typename TTypes<T>::Scalar output); \
+ extern template struct L2Loss<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("L2Loss").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ L2LossOp<GPUDevice, T>);
+
+REGISTER_GPU_KERNEL(float);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/l2loss_op.h b/tensorflow/core/kernels/l2loss_op.h
new file mode 100644
index 0000000000..d307353e24
--- /dev/null
+++ b/tensorflow/core/kernels/l2loss_op.h
@@ -0,0 +1,24 @@
+#ifndef TENSORFLOW_KERNELS_L2LOSS_OP_H_
+#define TENSORFLOW_KERNELS_L2LOSS_OP_H_
+// Functor definition for L2LossOp, must be compilable by nvcc.
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by L2LossOp to do the computations.
+template <typename Device, typename T>
+struct L2Loss {
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor input,
+ typename TTypes<T>::Scalar output) {
+ // We flatten the input tensor and reduce on dimension 0, producing
+ // a single number which is Mul(Sum(x^2), 0.5).
+ output.device(d) = input.square().sum() * static_cast<T>(0.5);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_L2LOSS_OP_H_
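The functor reduces the whole input to 0.5 * sum(x_i^2); for input [1, 2, 3] that is 0.5 * (1 + 4 + 9) = 7. A plain-C++ check of the arithmetic, purely illustrative:

#include <iostream>
#include <vector>

// 0.5 * sum(x_i^2), the same reduction the functor expresses with Eigen.
float L2Loss(const std::vector<float>& x) {
  float sum_sq = 0.f;
  for (float v : x) sum_sq += v * v;
  return 0.5f * sum_sq;
}

int main() {
  std::cout << L2Loss({1.f, 2.f, 3.f}) << "\n";  // 7
}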
diff --git a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
new file mode 100644
index 0000000000..858fcfe8d3
--- /dev/null
+++ b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc
@@ -0,0 +1,16 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/l2loss_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::L2Loss<GPUDevice, float>;
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
new file mode 100644
index 0000000000..93342a7a24
--- /dev/null
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -0,0 +1,99 @@
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+
+namespace tensorflow {
+
+void LinearAlgebraOpBase::Compute(OpKernelContext* context) {
+ const Tensor& in = context->input(0);
+
+ const int input_rank = GetInputMatrixRank();
+ OP_REQUIRES(
+ context, input_rank == 2,
+ errors::InvalidArgument("Only matrix inputs are supported so far."));
+ if (SupportsBatchOperation()) {
+ OP_REQUIRES(context, in.dims() > input_rank,
+ errors::InvalidArgument("Input tensor must have rank >= %d",
+ input_rank + 1));
+ } else {
+ OP_REQUIRES(context, in.dims() == input_rank,
+ errors::InvalidArgument("Input tensor must have rank == %d",
+ input_rank));
+ }
+
+ // If the tensor rank is greater than input_rank, we consider the inner-most
+ // dimensions as matrices, and loop over all the other outer
+ // dimensions to compute the results.
+ // TODO(kalakris): Only matrix inputs are currently supported.
+ const int row_dimension = in.dims() - 2;
+ const int col_dimension = in.dims() - 1;
+ const int64 num_rows = in.dim_size(row_dimension);
+ const int64 num_cols = in.dim_size(col_dimension);
+ const TensorShape input_matrix_shape = TensorShape({num_rows, num_cols});
+ const TensorShape output_matrix_shape =
+ GetOutputMatrixShape(input_matrix_shape);
+ OP_REQUIRES(context, output_matrix_shape.dims() <= 2,
+ errors::InvalidArgument("Output rank must be 1 or 2."));
+
+ int num_matrices = 1;
+ // The output has the shape of all the outer dimensions of the input
+ // except for the last two, plus the output_matrix_shape (if the output
+ // is not scalar). This still assumes that each input matrix is
+ // 2-dimensional, in accordance with the TODO above.
+ TensorShape output_shape;
+ if (in.dims() == 2) {
+ output_shape = output_matrix_shape;
+ } else {
+ for (int dim = 0; dim <= in.dims() - 3; ++dim) {
+ num_matrices *= in.dim_size(dim);
+ output_shape.AddDim(in.dim_size(dim));
+ }
+ for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) {
+ output_shape.AddDim(output_matrix_shape.dim_size(dim));
+ }
+ }
+
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &out));
+
+ auto shard = [this, &in, &input_matrix_shape, &output_matrix_shape, context,
+ out](int64 begin, int64 end) {
+ for (int64 i = begin; i < end; ++i) {
+ ComputeMatrix(context, i, in, input_matrix_shape, out,
+ output_matrix_shape);
+ }
+ };
+
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+ Shard(worker_threads.num_threads, worker_threads.workers, num_matrices,
+ GetCostPerUnit(input_matrix_shape), shard);
+}
+
+template <typename Scalar, bool SupportsBatchOperationT>
+void LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ComputeMatrix(
+ OpKernelContext* context, int64 matrix_index, const Tensor& in,
+ const TensorShape& input_matrix_shape, Tensor* out,
+ const TensorShape& output_matrix_shape) {
+ // TODO(kalakris): Handle alignment if possible. Eigen::Map is
+ // unaligned by default.
+ ConstMatrixMap input(in.flat<Scalar>().data() +
+ matrix_index * input_matrix_shape.num_elements(),
+ input_matrix_shape.dim_size(0),
+ input_matrix_shape.dim_size(1));
+
+ // The output matrix shape may not be a matrix.
+ int num_output_rows =
+ output_matrix_shape.dims() >= 1 ? output_matrix_shape.dim_size(0) : 1;
+ int num_output_cols =
+ output_matrix_shape.dims() == 2 ? output_matrix_shape.dim_size(1) : 1;
+ MatrixMap output(out->flat<Scalar>().data() +
+ matrix_index * output_matrix_shape.num_elements(),
+ num_output_rows, num_output_cols);
+ ComputeMatrix(context, input, &output);
+}
+
+// Explicitly instantiate LinearAlgebraOp for the scalar types we expect to use.
+template class LinearAlgebraOp<float, false>;
+template class LinearAlgebraOp<float, true>;
+template class LinearAlgebraOp<double, false>;
+template class LinearAlgebraOp<double, true>;
+
+} // namespace tensorflow
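The batching logic treats the innermost two dimensions as the matrix and everything outside them as the batch, so an input of shape [2, 3, 4, 4] yields num_matrices = 6 and, for a scalar per-matrix output, an output shape of [2, 3]. A small sketch of that shape bookkeeping in plain C++ (names are illustrative):

#include <iostream>
#include <vector>

// Reproduces the shape bookkeeping above: outer dims form the batch, the last
// two dims are the matrix, and the per-matrix output shape is appended.
int main() {
  std::vector<long long> in_dims = {2, 3, 4, 4};  // batch of 2*3 matrices, each 4x4
  std::vector<long long> out_matrix_dims = {};    // scalar output per matrix

  long long num_matrices = 1;
  std::vector<long long> out_dims;
  for (size_t d = 0; d + 2 < in_dims.size(); ++d) {  // all but the last two dims
    num_matrices *= in_dims[d];
    out_dims.push_back(in_dims[d]);
  }
  for (long long d : out_matrix_dims) out_dims.push_back(d);

  std::cout << "num_matrices = " << num_matrices << ", output rank = "
            << out_dims.size() << "\n";  // num_matrices = 6, output rank = 2
}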
diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h
new file mode 100644
index 0000000000..471f11e25f
--- /dev/null
+++ b/tensorflow/core/kernels/linalg_ops_common.h
@@ -0,0 +1,123 @@
+#ifndef TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/work_sharder.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// A base class to support linear algebra functionality, similar to the
+// numpy.linalg module. Supports batch computation on several matrices at once,
+// sharding the computations across different threads if necessary.
+//
+// TODO(kalakris): This needs to be expanded to support binary inputs, and
+// multiple outputs.
+class LinearAlgebraOpBase : public OpKernel {
+ public:
+ explicit LinearAlgebraOpBase(OpKernelConstruction* context)
+ : OpKernel(context) {}
+ ~LinearAlgebraOpBase() override {}
+
+ // Return the expected rank of the input.
+ // TODO(kalakris): This should be a virtual function to support vector inputs.
+ int GetInputMatrixRank() { return 2; }
+
+ // Return the output shape of each individual matrix operation. Must be
+ // rank 0, 1, or 2. Scalar outputs are rank 0.
+ virtual TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) = 0;
+
+ // Return the cost per matrix operation. Cost per unit is assumed to be
+ // roughly 1ns, based on comments in core/util/work_sharder.cc.
+ virtual int64 GetCostPerUnit(const TensorShape& input_matrix_shape) = 0;
+
+ // If SupportsBatchOperation() returns false, this Op will only accept rank 2
+ // (if the supported input type is a matrix). If it returns true, the Op will
+ // accept inputs of rank >= 3, and repeatedly execute the operation on all
+ // matrices in the innermost two dimensions.
+ virtual bool SupportsBatchOperation() = 0;
+
+ // Perform the actual computation on an input matrix, and store the results
+ // in the output. This will be called repeatedly for a single call to
+ // Compute(), if multiple matrices exist in the input Tensor.
+ //
+ // This function should only compute the results for a single input matrix.
+ // The 'matrix_index' parameter specifies the index of the matrix to be used
+ // from the input, and the index of the matrix to be written to in the output.
+ // The input matrix is in row major order, and is located at the memory
+ // address
+ // in.flat<Scalar>().data() +
+ // matrix_index * input_matrix_shape.num_elements().
+ // The output matrix is in row major order, and is located at the memory
+ // address
+ // out->flat<Scalar>().data() +
+ // matrix_index * output_matrix_shape.num_elements().
+ // The LinearAlgebraOp<Scalar> class below has functionality which performs
+ // this mapping and presents an interface based on the Eigen::MatrixBase API.
+ virtual void ComputeMatrix(OpKernelContext* context, int64 matrix_index,
+ const Tensor& in,
+ const TensorShape& input_matrix_shape, Tensor* out,
+ const TensorShape& output_matrix_shape) = 0;
+
+ void Compute(OpKernelContext* context) override;
+};
+
+// A base class for linear algebra ops templated on the scalar type.
+//
+// This base class encapsulates the functionality of mapping the input and
+// output tensors using Eigen::Map, so that the Eigen::MatrixBase API may be
+// directly used by derived classes.
+// SupportsBatchOperationT is a bool template argument which if set to true
+// will allow the Op to process batches of matrices (rank >= 3); if set to
+// false the Op will only accept rank 2 inputs.
+template <typename Scalar, bool SupportsBatchOperationT>
+class LinearAlgebraOp : public LinearAlgebraOpBase {
+ public:
+ explicit LinearAlgebraOp(OpKernelConstruction* context)
+ : LinearAlgebraOpBase(context) {}
+
+ using ConstMatrixMap =
+ Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>>;
+ using MatrixMap = Eigen::Map<
+ Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
+ // Perform the actual computation on the input matrix, and store the results
+ // in the output. This will be called repeatedly for a single call to
+ // Compute(), if multiple matrices exist in the input Tensor.
+ virtual void ComputeMatrix(OpKernelContext* context,
+ const ConstMatrixMap& input,
+ MatrixMap* output) = 0;
+
+ bool SupportsBatchOperation() final { return SupportsBatchOperationT; }
+
+ // A concrete implementation of LinearAlgebraOpBase::ComputeMatrix().
+ void ComputeMatrix(OpKernelContext* context, int64 matrix_index,
+ const Tensor& in, const TensorShape& input_matrix_shape,
+ Tensor* out, const TensorShape& output_matrix_shape) final;
+};
+
+// Declare that LinearAlgebraOp is explicitly instantiated in
+// linalg_ops_common.cc for float and double.
+extern template class LinearAlgebraOp<float, false>;
+extern template class LinearAlgebraOp<float, true>;
+extern template class LinearAlgebraOp<double, false>;
+extern template class LinearAlgebraOp<double, true>;
+
+} // namespace tensorflow
+
+#define REGISTER_LINALG_OP(OpName, OpClass, Scalar) \
+ REGISTER_KERNEL_BUILDER( \
+ Name(OpName).Device(DEVICE_CPU).TypeConstraint<Scalar>("T"), OpClass)
+
+#endif // TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
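A derived kernel fills in the output shape, the cost estimate, and the per-matrix computation; the Eigen::Map plumbing is inherited. A hypothetical sketch against the interface above (the "MatrixTrace" op, its class, and its cost figure are illustrative and not part of this change; it assumes linalg_ops_common.h is included):

#include "tensorflow/core/kernels/linalg_ops_common.h"

namespace tensorflow {

// Hypothetical subclass, sketched only to show which hooks a derived kernel
// fills in: output shape, cost per matrix, and the per-matrix computation.
template <typename Scalar, bool SupportsBatchOperationT>
class MatrixTraceOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
 public:
  typedef LinearAlgebraOp<Scalar, SupportsBatchOperationT> Base;
  using typename Base::ConstMatrixMap;
  using typename Base::MatrixMap;

  explicit MatrixTraceOp(OpKernelConstruction* context) : Base(context) {}

  TensorShape GetOutputMatrixShape(
      const TensorShape& input_matrix_shape) override {
    return TensorShape({});  // one scalar per input matrix
  }

  int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
    return input_matrix_shape.dim_size(0);  // roughly one add per diagonal entry
  }

  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
                     MatrixMap* output) override {
    (*output)(0, 0) = input.trace();
  }
};

// Registration would follow the macro above, for example:
// REGISTER_LINALG_OP("MatrixTrace", (MatrixTraceOp<float, false>), float);

}  // namespace tensorflow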
diff --git a/tensorflow/core/kernels/listdiff_op.cc b/tensorflow/core/kernels/listdiff_op.cc
new file mode 100644
index 0000000000..f490f5ddd3
--- /dev/null
+++ b/tensorflow/core/kernels/listdiff_op.cc
@@ -0,0 +1,75 @@
+#include <unordered_set>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+template <typename T>
+class ListDiffOp : public OpKernel {
+ public:
+ explicit ListDiffOp(OpKernelConstruction* context) : OpKernel(context) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(context, context->MatchSignature({dt, dt}, {dt, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& x = context->input(0);
+ const Tensor& y = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(x.shape()),
+ errors::InvalidArgument("x should be a 1D vector."));
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(y.shape()),
+ errors::InvalidArgument("y should be a 1D vector."));
+
+ std::unordered_set<T> y_set;
+ const auto Ty = y.vec<T>();
+ const int y_size = Ty.size();
+ y_set.reserve(y_size);
+ for (int i = 0; i < y_size; ++i) {
+ y_set.insert(Ty(i));
+ }
+
+ // Compute the size of the output.
+ const auto Tx = x.vec<T>();
+ const int x_size = Tx.size();
+
+ int out_size = 0;
+ for (int i = 0; i < x_size; ++i) {
+ if (y_set.count(Tx(i)) == 0) {
+ ++out_size;
+ }
+ }
+
+ // Allocate and populate outputs.
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, {out_size}, &out));
+ auto Tout = out->vec<T>();
+
+ Tensor* indices = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, {out_size}, &indices));
+ auto Tindices = indices->vec<int32>();
+
+ for (int i = 0, p = 0; i < x_size; ++i) {
+ if (y_set.count(Tx(i)) == 0) {
+ Tout(p) = Tx(i);
+ Tindices(p) = i;
+ p++;
+ }
+ }
+ }
+};
+
+#define REGISTER_LISTDIFF(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ListDiff").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ListDiffOp<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_LISTDIFF);
+#undef REGISTER_LISTDIFF
+
+} // namespace tensorflow
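ListDiff keeps the elements of x that never appear in y, in their original order, together with their positions; for x = [1, 2, 3, 4, 5, 6] and y = [1, 3, 5] the outputs are out = [2, 4, 6] and idx = [1, 3, 5]. A standalone sketch of the same two-pass scheme (hash y, then filter x):

#include <iostream>
#include <unordered_set>
#include <vector>

// Same two-pass scheme as the kernel: build a set from y, then keep every
// x element (and its index) that is absent from the set.
int main() {
  std::vector<int> x = {1, 2, 3, 4, 5, 6};
  std::vector<int> y = {1, 3, 5};
  std::unordered_set<int> y_set(y.begin(), y.end());

  std::vector<int> out, idx;
  for (int i = 0; i < static_cast<int>(x.size()); ++i) {
    if (y_set.count(x[i]) == 0) {
      out.push_back(x[i]);
      idx.push_back(i);
    }
  }
  for (size_t i = 0; i < out.size(); ++i)
    std::cout << out[i] << "@" << idx[i] << " ";  // 2@1 4@3 6@5
  std::cout << "\n";
}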
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
new file mode 100644
index 0000000000..ec84145f75
--- /dev/null
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -0,0 +1,77 @@
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+class AssertOp : public OpKernel {
+ public:
+ explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("summarize", &summarize_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& cond = ctx->input(0);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(cond.shape()),
+ errors::InvalidArgument("In[0] should be a scalar: ",
+ cond.shape().ShortDebugString()));
+
+ if (cond.scalar<bool>()()) {
+ return;
+ }
+ string msg = "assertion failed: ";
+ for (int i = 1; i < ctx->num_inputs(); ++i) {
+ strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_),
+ "]");
+ if (i < ctx->num_inputs() - 1) strings::StrAppend(&msg, " ");
+ }
+ ctx->SetStatus(errors::InvalidArgument(msg));
+ }
+
+ private:
+ int32 summarize_ = 0;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Assert").Device(DEVICE_CPU), AssertOp);
+
+class PrintOp : public OpKernel {
+ public:
+ explicit PrintOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), call_counter_(0) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("message", &message_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("first_n", &first_n_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("summarize", &summarize_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (IsRefType(ctx->input_dtype(0))) {
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ } else {
+ ctx->set_output(0, ctx->input(0));
+ }
+ if (first_n_ >= 0) {
+ mutex_lock l(mu_);
+ if (call_counter_ >= first_n_) return;
+ call_counter_++;
+ }
+ string msg;
+ strings::StrAppend(&msg, message_);
+ for (int i = 1; i < ctx->num_inputs(); ++i) {
+ strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_),
+ "]");
+ }
+ LOG(INFO) << msg;
+ }
+
+ private:
+ mutex mu_;
+ int64 call_counter_ GUARDED_BY(mu_) = 0;
+ int64 first_n_ = 0;
+ int32 summarize_ = 0;
+ string message_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Print").Device(DEVICE_CPU), PrintOp);
+
+} // end namespace tensorflow
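PrintOp forwards its first input unchanged and logs at most first_n times (a negative first_n means no limit), with the counter guarded by a mutex so concurrent Compute calls do not over-print. A framework-free sketch of that throttle (class and names are illustrative):

#include <iostream>
#include <mutex>
#include <string>

// Logs at most `first_n` times; first_n < 0 means no limit. Mirrors the
// counter-under-mutex throttle used by PrintOp above.
class FirstNLogger {
 public:
  explicit FirstNLogger(int first_n) : first_n_(first_n) {}

  void Log(const std::string& msg) {
    if (first_n_ >= 0) {
      std::lock_guard<std::mutex> lock(mu_);
      if (call_counter_ >= first_n_) return;
      ++call_counter_;
    }
    std::cout << msg << "\n";
  }

 private:
  std::mutex mu_;
  int call_counter_ = 0;
  const int first_n_;
};

int main() {
  FirstNLogger logger(/*first_n=*/3);
  for (int i = 0; i < 5; ++i) logger.Log("tick");  // prints "tick" three times
}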
diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc
new file mode 100644
index 0000000000..a7af6eb303
--- /dev/null
+++ b/tensorflow/core/kernels/logging_ops_test.cc
@@ -0,0 +1,87 @@
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace {
+
+class PrintingGraphTest : public OpsTestBase {
+ protected:
+ Status Init(DataType input_type1, DataType input_type2, string msg = "",
+ int first_n = -1, int summarize = 3) {
+ RequireDefaultOps();
+ TF_CHECK_OK(NodeDefBuilder("op", "Print")
+ .Input(FakeInput(input_type1))
+ .Input(FakeInput(2, input_type2))
+ .Attr("message", msg)
+ .Attr("first_n", first_n)
+ .Attr("summarize", summarize)
+ .Finalize(node_def()));
+ return InitOp();
+ }
+};
+
+TEST_F(PrintingGraphTest, Int32Success_6) {
+ ASSERT_OK(Init(DT_INT32, DT_INT32));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, Int32Success_Summarize6) {
+ ASSERT_OK(Init(DT_INT32, DT_INT32, "", -1, 6));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, StringSuccess) {
+ ASSERT_OK(Init(DT_INT32, DT_STRING));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<string>(TensorShape({}), {"foo"});
+ AddInputFromArray<string>(TensorShape({}), {"bar"});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, MsgSuccess) {
+ ASSERT_OK(Init(DT_INT32, DT_STRING, "Message: "));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<string>(TensorShape({}), {"foo"});
+ AddInputFromArray<string>(TensorShape({}), {"bar"});
+ ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(PrintingGraphTest, FirstNSuccess) {
+ ASSERT_OK(Init(DT_INT32, DT_STRING, "", 3));
+ AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<string>(TensorShape({}), {"foo"});
+ AddInputFromArray<string>(TensorShape({}), {"bar"});
+  // Run 4 times, but only the first 3 runs should print, as intended.
+ for (int i = 0; i < 4; i++) ASSERT_OK(RunOpKernel());
+ Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+ test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+} // end namespace
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
new file mode 100644
index 0000000000..9781bcfa59
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -0,0 +1,116 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+#include "tensorflow/core/kernels/lookup_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Iterator to initialize tables given 'keys' and 'values' tensors.
+//
+// Both tensors are returned in the first iteration. The iterator does not
+// loop over individual elements, since lookup table insertions can process
+// whole batches.
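+//
+// Illustrative usage sketch (editorial, not part of the original change):
+//
+//   KeyValueTensorIterator iter(&keys, &values);
+//   TF_RETURN_IF_ERROR(table->Initialize(iter));
+//
+// Initialize() consumes the whole key/value batch in a single Valid()/Next()
+// cycle of this iterator.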
+class KeyValueTensorIterator
+ : public InitializableLookupTable::InitTableIterator {
+ public:
+ // keys and values are not owned by the iterator.
+ explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values)
+ : keys_(keys), values_(values), valid_(true), status_(Status::OK()) {
+ TensorShape key_shape = keys_->shape();
+ if (!key_shape.IsSameSize(values_->shape())) {
+ valid_ = false;
+ status_ = errors::InvalidArgument(
+ "keys and values should have the same dimension.",
+ key_shape.DebugString(), " vs ", values_->shape().DebugString());
+ }
+ if (key_shape.num_elements() == 0) {
+ valid_ = false;
+ status_ =
+ errors::InvalidArgument("keys and values cannot be empty tensors.");
+ }
+ }
+
+ bool Valid() const override { return valid_; }
+
+ void Next() override {
+ valid_ = false;
+ status_ = errors::OutOfRange("No more data.");
+ }
+
+ const Tensor& keys() const override { return *keys_; }
+
+ const Tensor& values() const override { return *values_; }
+
+ Status status() const override { return status_; }
+
+ int64 total_size() const {
+ return keys_ == nullptr ? -1 : keys_->NumElements();
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator);
+
+ const Tensor* keys_; // Doesn't own it.
+ const Tensor* values_; // Doesn't own it.
+ bool valid_; // true if the iterator points to an existing range.
+ Status status_;
+};
+
+} // namespace lookup
+
+// Kernel to initialize a lookup table given key and value tensors.
+// After this operation, the table becomes read-only.
+class InitializeTableOp : public OpKernel {
+ public:
+ explicit InitializeTableOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ lookup::InitializableLookupTable* table;
+ OP_REQUIRES_OK(ctx,
+ GetInitializableLookupTable("table_handle", ctx, &table));
+ core::ScopedUnref unref_me(table);
+
+ DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+ table->value_dtype()};
+ DataTypeVector expected_outputs = {};
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
+
+ const Tensor& keys = ctx->input(1);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(keys.shape()),
+ errors::InvalidArgument("Keys must be a vector, but received ",
+ keys.shape().DebugString()));
+
+ const Tensor& values = ctx->input(2);
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsVector(values.shape()),
+ errors::InvalidArgument("Values must be a vector, but received ",
+ values.shape().DebugString()));
+
+ OP_REQUIRES(ctx, keys.NumElements() == values.NumElements(),
+ errors::InvalidArgument(
+ "Keys and values must have the same size ",
+ keys.NumElements(), " vs ", values.NumElements()));
+
+ lookup::KeyValueTensorIterator iter(&keys, &values);
+ OP_REQUIRES_OK(ctx, table->Initialize(iter));
+ }
+
+ private:
+ mutex mu_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InitializeTable").Device(DEVICE_CPU),
+ InitializeTableOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
new file mode 100644
index 0000000000..2bab4df94f
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -0,0 +1,166 @@
+#include "tensorflow/core/kernels/lookup_table_op.h"
+#define EIGEN_USE_THREADS
+
+#include <string>
+#include <utility>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Lookup table that wraps an unordered_map, with the key and value data types
+// specified by the template parameters.
+//
+// This table is recommended for arbitrary key values.
+//
+// For lookups, the table must be initialized (allocated and populated). Once
+// the table is marked as initialized it becomes read-only.
+//
+// Sample use case:
+//
+// HashTable<int64, int64> table; // int64 -> int64.
+// table.Prepare(10); // Prepare the underlying data structure; the number of
+// // elements is required by the interface, but not used.
+// // Populate the table, elements could be added in one or multiple calls.
+// table.Insert(key_tensor, value_tensor); // Populate the table.
+// ...
+// table.set_is_initialized();
+//
+// table.Find(in_t, &out_t, default_t)
+//
+template <class K, class V>
+class HashTable : public InitializableLookupTable {
+ public:
+ size_t size() const override { return table_ ? table_->size() : 0; }
+
+ DataType key_dtype() const override { return DataTypeToEnum<K>::v(); }
+
+ DataType value_dtype() const override { return DataTypeToEnum<V>::v(); }
+
+ protected:
+ Status DoPrepare(size_t unused) override {
+ if (is_initialized_) {
+ return errors::Aborted("HashTable already initialized.");
+ }
+ if (!table_) {
+ table_ = std::unique_ptr<std::unordered_map<K, V>>(
+ new std::unordered_map<K, V>());
+ }
+ return Status::OK();
+ }
+
+ Status DoInsert(const Tensor& keys, const Tensor& values) override {
+ if (!table_) {
+ return errors::FailedPrecondition("HashTable is not prepared.");
+ }
+
+ const auto key_values = keys.flat<K>();
+ const auto value_values = values.flat<V>();
+ for (size_t i = 0; i < key_values.size(); ++i) {
+ const K& key = key_values(i);
+ const V& value = value_values(i);
+ const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
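+ // Editorial note on the assumed semantics of gtl::LookupOrInsert: it
+ // returns a reference to the value already stored under `key` when the key
+ // is present, and inserts `value` otherwise. The check below therefore
+ // rejects duplicate keys that map to conflicting values.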
+ if (previous_value != value) {
+ return errors::FailedPrecondition(
+ "HashTable has different value for same key. Key ", key, " has ",
+ previous_value, " and trying to add value ", value);
+ }
+ }
+ return Status::OK();
+ }
+
+ Status DoFind(const Tensor& key, Tensor* value,
+ const Tensor& default_value) override {
+ const V default_val = default_value.flat<V>()(0);
+ const auto key_values = key.flat<K>();
+ auto value_values = value->flat<V>();
+
+ for (size_t i = 0; i < key_values.size(); ++i) {
+ value_values(i) =
+ gtl::FindWithDefault(*table_, key_values(i), default_val);
+ }
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<std::unordered_map<K, V>> table_;
+};
+
+} // namespace lookup
+
+// Table lookup op. Performs the lookup operation on the given table.
+class LookupTableFindOp : public OpKernel {
+ public:
+ explicit LookupTableFindOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ lookup::LookupInterface* table;
+ OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
+ core::ScopedUnref unref_me(table);
+
+ DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+ table->value_dtype()};
+ DataTypeVector expected_outputs = {table->value_dtype()};
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
+
+ const Tensor& input = ctx->input(1);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("Input must be a vector, not ",
+ input.shape().DebugString()));
+
+ const Tensor& default_value = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(default_value.shape()),
+ errors::InvalidArgument("Default value must be a scalar, not ",
+ default_value.shape().DebugString()));
+
+ Tensor* out;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_output("output_values", input.shape(), &out));
+
+ OP_REQUIRES_OK(ctx, table->Find(input, out, default_value));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LookupTableFind").Device(DEVICE_CPU),
+ LookupTableFindOp);
+
+// Op that returns the size of the given table.
+class LookupTableSizeOp : public OpKernel {
+ public:
+ explicit LookupTableSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ lookup::LookupInterface* table;
+ OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
+ core::ScopedUnref unref_me(table);
+
+ Tensor* out;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output("size", TensorShape({}), &out));
+ out->flat<int64>().setConstant(table->size());
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LookupTableSize").Device(DEVICE_CPU),
+ LookupTableSizeOp);
+
+// Register the HashTable op with the currently supported key and value types.
+#define REGISTER_KERNEL(key_dtype, value_dtype) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("HashTable") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<key_dtype>("key_dtype") \
+ .TypeConstraint<value_dtype>("value_dtype"), \
+ LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
+ value_dtype>)
+
+REGISTER_KERNEL(string, int64);
+REGISTER_KERNEL(int64, string);
+
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
new file mode 100644
index 0000000000..dc53ce33a6
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -0,0 +1,80 @@
+#ifndef TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
+#define TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
+
+#include "tensorflow/core/framework/lookup_interface.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/lookup_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Lookup table op that supports different table implementations specified by
+// the 'Container' template. Container must be derived from LookupInterface.
+// The key and value are of the templated types "key_dtype" and "value_dtype",
+// respectively.
+template <class Container, class key_dtype, class value_dtype>
+class LookupTableOp : public OpKernel {
+ public:
+ // ctx is not owned by this class.
+ explicit LookupTableOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), table_handle_set_(false) {
+ OP_REQUIRES_OK(ctx, ctx->allocate_persistent(tensorflow::DT_STRING,
+ tensorflow::TensorShape({2}),
+ &table_handle_, nullptr));
+ }
+
+ // ctx is not owned by this function.
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ if (!table_handle_set_) {
+ OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def()));
+ auto creator = [this](lookup::LookupInterface** ret) {
+ *ret = new Container();
+ return Status::OK();
+ };
+
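+ // Editorial note (assumed ResourceMgr semantics): LookupOrCreate either
+ // returns the table already registered under (container, name) or invokes
+ // `creator` to build a new Container. Either way it hands back a
+ // referenced pointer, which ScopedUnref below releases.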
+ lookup::LookupInterface* table = nullptr;
+ OP_REQUIRES_OK(
+ ctx, cinfo_.resource_manager()
+ ->template LookupOrCreate<lookup::LookupInterface>(
+ cinfo_.container(), cinfo_.name(), &table, creator));
+ core::ScopedUnref unref_me(table);
+
+ OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes(
+ *table, DataTypeToEnum<key_dtype>::v(),
+ DataTypeToEnum<value_dtype>::v(), cinfo_.name()));
+
+ auto h = table_handle_.AccessTensor(ctx)->template flat<string>();
+ h(0) = cinfo_.container();
+ h(1) = cinfo_.name();
+ table_handle_set_ = true;
+ }
+ ctx->set_output_ref(0, &mu_, table_handle_.AccessTensor(ctx));
+ }
+
+ ~LookupTableOp() override {
+ // If the table object was not shared, delete it.
+ if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+ TF_CHECK_OK(
+ cinfo_.resource_manager()->template Delete<lookup::LookupInterface>(
+ cinfo_.container(), cinfo_.name()));
+ }
+ }
+
+ private:
+ mutex mu_;
+ PersistentTensor table_handle_ GUARDED_BY(mu_);
+ bool table_handle_set_ GUARDED_BY(mu_);
+ ContainerInfo cinfo_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(LookupTableOp);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
new file mode 100644
index 0000000000..634c11e4a5
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -0,0 +1,72 @@
+#include "tensorflow/core/kernels/lookup_util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+namespace lookup {
+namespace {
+
+Status GetTableHandle(const string& input_name, OpKernelContext* ctx,
+ string* container, string* table_handle) {
+ {
+ mutex* mu;
+ TF_RETURN_IF_ERROR(ctx->input_ref_mutex(input_name, &mu));
+ mutex_lock l(*mu);
+ Tensor tensor;
+ TF_RETURN_IF_ERROR(ctx->mutable_input(input_name, &tensor, true));
+ if (tensor.NumElements() != 2) {
+ return errors::InvalidArgument(
+ "Lookup table handle must have exactly two elements (container, name), "
+ "but had shape: ",
+ tensor.shape().DebugString());
+ }
+ auto h = tensor.flat<string>();
+ *container = h(0);
+ *table_handle = h(1);
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Status GetLookupTable(const string& input_name, OpKernelContext* ctx,
+ LookupInterface** table) {
+ string container;
+ string table_handle;
+ TF_RETURN_IF_ERROR(
+ GetTableHandle(input_name, ctx, &container, &table_handle));
+ return ctx->resource_manager()->Lookup(container, table_handle, table);
+}
+
+Status GetInitializableLookupTable(const string& input_name,
+ OpKernelContext* ctx,
+ InitializableLookupTable** table) {
+ string container;
+ string table_handle;
+ TF_RETURN_IF_ERROR(
+ GetTableHandle(input_name, ctx, &container, &table_handle));
+ LookupInterface* lookup_table;
+ TF_RETURN_IF_ERROR(
+ ctx->resource_manager()->Lookup(container, table_handle, &lookup_table));
+ *table = dynamic_cast<InitializableLookupTable*>(lookup_table);
+ if (*table == nullptr) {
+ lookup_table->Unref();
+ return errors::InvalidArgument("Table ", container, " ", table_handle,
+ " is not initializable");
+ }
+ return Status::OK();
+}
+
+Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype,
+ DataType value_dtype, const string& table_name) {
+ if (table.key_dtype() != key_dtype || table.value_dtype() != value_dtype) {
+ return errors::InvalidArgument(
+ "Conflicting key/value dtypes ", key_dtype, "->", value_dtype, " with ",
+ table.key_dtype(), "-", table.value_dtype(), " for table ", table_name);
+ }
+ return Status::OK();
+}
+
+} // namespace lookup
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h
new file mode 100644
index 0000000000..991a757edd
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_util.h
@@ -0,0 +1,31 @@
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_
+
+#include "tensorflow/core/framework/lookup_interface.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Gets the LookupTable stored in ctx->resource_manager() under the handle
+// passed through the op input named input_name. Returns an error if the table
+// doesn't exist.
+Status GetLookupTable(const string& input_name, OpKernelContext* ctx,
+ LookupInterface** table);
+
+// Gets the InitializableLookupTable stored in ctx->resource_manager() under
+// the handle passed through the op input named input_name. Returns an error if
+// the table doesn't exist or is not initializable.
+Status GetInitializableLookupTable(const string& input_name,
+ OpKernelContext* ctx,
+ InitializableLookupTable** table);
+
+// Verifies that the given key_dtype and value_dtype match the corresponding
+// table's data types.
+Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype,
+ DataType value_dtype, const string& table_name);
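+
+// Illustrative usage sketch (editorial), assuming an op whose first input is
+// the table handle named "table_handle":
+//
+//   lookup::LookupInterface* table;
+//   OP_REQUIRES_OK(ctx, lookup::GetLookupTable("table_handle", ctx, &table));
+//   core::ScopedUnref unref_me(table);  // Lookup() adds a reference.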
+} // namespace lookup
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
new file mode 100644
index 0000000000..e5abf5906f
--- /dev/null
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -0,0 +1,228 @@
+// LRN = Local Response Normalization
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#ifndef __ANDROID__
+#include "tensorflow/core/util/work_sharder.h"
+#endif
+
+namespace tensorflow {
+
+// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
+// depth_radius + 1) around the diagonal.
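+//
+// For example (illustrative, not in the original): with depth = 4 and
+// depth_radius = 1 the result is
+//   1 1 0 0
+//   1 1 1 0
+//   0 1 1 1
+//   0 0 1 1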
+static void GetBandMatrix(int depth, int64 depth_radius,
+ Eigen::Tensor<float, 2, Eigen::RowMajor>* result) {
+ result->setZero();
+ for (int row = 0; row < depth; ++row) {
+ const int begin = std::max<int>(0, row - depth_radius);
+ const int end = std::min<int64>(depth, row + depth_radius + 1);
+ Eigen::DSizes<ptrdiff_t, 2> start(row, begin);
+ Eigen::DSizes<ptrdiff_t, 2> sizes(1, end - begin);
+ result->slice(start, sizes).setConstant(1.0f);
+ }
+}
+
+class LRNOp : public OpKernel {
+ public:
+ explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_));
+ OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+ OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+ OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in = context->input(0);
+ OP_REQUIRES(context, in.dims() == 4,
+ errors::InvalidArgument("in must be 4-dimensional"));
+ const int64 batch = in.dim_size(0);
+ const int64 rows = in.dim_size(1);
+ const int64 cols = in.dim_size(2);
+ const int64 depth = in.dim_size(3);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({batch, rows, cols, depth}), &output));
+
+#ifdef __ANDROID__
+ MognetLRN(in, batch, rows, cols, depth, output);
+#else
+ const int nodes = cols * rows;
+ auto in_shaped = in.shaped<float, 2>({nodes * batch, depth});
+
+ // Contracting the squared input with the band matrix sums the squared
+ // activations over the depth window around each channel.
+ Eigen::Tensor<float, 2, Eigen::RowMajor> multiplier(depth, depth);
+ GetBandMatrix(depth, depth_radius_, &multiplier);
+
+ auto out_shaped = output->shaped<float, 2>({nodes * batch, depth});
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+ /// TODO(keveman): Optimize for beta in {0, 1, 0.5}
+ out_shaped.device(context->eigen_cpu_device()) =
+ in_shaped /
+ in_shaped.square()
+ .contract(multiplier, dims)
+ .unaryExpr([this](float x) { return bias_ + alpha_ * x; })
+ .pow(beta_);
+#endif
+ }
+
+ private:
+ typedef Eigen::Tensor<float, 1, Eigen::RowMajor>::DimensionPair DimPair;
+
+ void MognetLRN(const Tensor& in, const int batch, const int rows,
+ const int cols, const int depth, Tensor* out) {
+ Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>>
+ data_in(in.flat<float>().data(), depth, batch * rows * cols);
+
+ Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> data_out(
+ out->flat<float>().data(), depth, batch * rows * cols);
+
+ const int double_depth_radius = depth_radius_ * 2;
+ Eigen::VectorXf padded_square(data_in.rows() + double_depth_radius);
+ padded_square.setZero();
+ for (int r = 0; r < data_in.cols(); ++r) {
+ // Do local response normalization for data_in(:, r)
+ // First, compute the squares and store them in a buffer for repeated use.
+ padded_square.block(depth_radius_, 0, data_out.rows(), 1) =
+ data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_;
+ // Then, compute the scale and write it to data_out.
+ float accumulated_scale = 0;
+ for (int i = 0; i < double_depth_radius; ++i) {
+ accumulated_scale += padded_square(i);
+ }
+ for (int i = 0; i < data_in.rows(); ++i) {
+ accumulated_scale += padded_square(i + double_depth_radius);
+ data_out(i, r) = bias_ + accumulated_scale;
+ accumulated_scale -= padded_square(i);
+ }
+ }
+
+ // For a few special values of beta, the pow computation can be replaced
+ // by cheaper operations.
+ if (beta_ == 1) {
+ data_out.array() = data_in.array() * data_out.array().inverse();
+ } else if (beta_ == 0.5) {
+ data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
+ } else {
+ data_out.array() = data_in.array() * data_out.array().pow(-beta_);
+ }
+ }
+
+ int64 depth_radius_;
+ float bias_;
+ float alpha_;
+ float beta_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("LRN").Device(DEVICE_CPU), LRNOp);
+
+#ifndef __ANDROID__
+
+class LRNGradOp : public OpKernel {
+ public:
+ explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_));
+ OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+ OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+ OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in_grads = context->input(0);
+ const Tensor& in_image = context->input(1);
+ const Tensor& out_image = context->input(2);
+
+ OP_REQUIRES(context, in_grads.dims() == 4 && in_image.dims() == 4,
+ errors::InvalidArgument("inputs must be 4-dimensional"));
+ const int64 batch = in_grads.dim_size(0);
+ const int64 rows = in_grads.dim_size(1);
+ const int64 cols = in_grads.dim_size(2);
+ const int64 depth = in_grads.dim_size(3);
+ OP_REQUIRES(
+ context,
+ in_image.dim_size(0) == batch && in_image.dim_size(1) == rows &&
+ in_image.dim_size(2) == cols && in_image.dim_size(3) == depth &&
+ out_image.dim_size(0) == batch && out_image.dim_size(1) == rows &&
+ out_image.dim_size(2) == cols && out_image.dim_size(3) == depth,
+ errors::InvalidArgument(
+ "input_grads, input_image, and out_image should have the same "
+ "shape"));
+ const auto nodes = cols * rows;
+ auto grads_shaped = in_grads.shaped<float, 2>({nodes * batch, depth});
+ auto in_shaped = in_image.shaped<float, 2>({nodes * batch, depth});
+ auto activations = out_image.shaped<float, 2>({nodes * batch, depth});
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({batch, rows, cols, depth}), &output));
+ auto out_shaped = output->shaped<float, 2>({nodes * batch, depth});
+ out_shaped.setZero();
+
+ auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+ depth](int64 begin, int64 end) {
+ for (int64 i = begin; i < end; ++i) {
+ for (int64 j = 0; j < depth; ++j) {
+ // Let y be the LRN activations and x be the inputs along the depth
+ // dimension. (LRN operates independently along rows, cols, and
+ // batch).
+ // We have
+ // yi = xi / (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
+ // x_j^2))^beta
+ //
+ // Let N = (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
+ // x_j^2))
+ // dy_i/dx_i = (N^beta - xi. beta*N^(beta-1)*2*alpha*xi)/N^(2*beta)
+ // dy_i/dx_j = ( - xi. beta*N^(beta-1)*2*alpha*xj)/N^(2*beta)
+ //
+ // NOTE(keveman) : We can compute N by doing (yi/xi) ^ (1/beta).
+ // However, this is numerically unstable for small values of xi. We
+ // compute N explicitly here to avoid that.
+
+ int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+ int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+ float norm = 0.0f;
+ for (int64 k = depth_begin; k < depth_end; ++k) {
+ norm += in_shaped(i, k) * in_shaped(i, k);
+ }
+ norm = alpha_ * norm + bias_;
+ DCHECK_GT(norm, 1e-6);
+ for (int64 k = depth_begin; k < depth_end; ++k) {
+ float dyi = -2.0f * alpha_ * beta_ * in_shaped(i, k) *
+ activations(i, j) / norm;
+ if (k == j) {
+ dyi += std::pow(norm, -beta_);
+ }
+ dyi *= grads_shaped(i, j);
+ const_cast<TTypes<float, 2>::Tensor&>(out_shaped)(i, k) += dyi;
+ }
+ }
+ }
+ };
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+ Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+ depth * depth, shard);
+ }
+
+ private:
+ typedef Eigen::Tensor<float, 1, Eigen::RowMajor>::DimensionPair DimPair;
+
+ int64 depth_radius_;
+ float bias_;
+ float alpha_;
+ float beta_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("LRNGrad").Device(DEVICE_CPU), LRNGradOp);
+
+#endif // __ANDROID__
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc
new file mode 100644
index 0000000000..4c338b6cb3
--- /dev/null
+++ b/tensorflow/core/kernels/lrn_op_test.cc
@@ -0,0 +1,185 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+static const float tol_ = 1e-4;
+
+class LRNFloatTest : public OpsTestBase {
+ protected:
+ LRNFloatTest() : philox_(123, 17), rand_(&philox_) { RequireDefaultOps(); }
+
+ int GetIntAttr(const string& name) {
+ int value;
+ TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value));
+ return value;
+ }
+
+ float GetFloatAttr(const string& name) {
+ float value;
+ TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value));
+ return value;
+ }
+
+ bool Compare() {
+ const auto& input = GetInput(0);
+ const int64 batch_size = input.dim_size(0);
+ const int64 rows = input.dim_size(1);
+ const int64 cols = input.dim_size(2);
+ const int64 depth = input.dim_size(3);
+ const int64 rest = cols * rows * batch_size;
+
+ const int64 depth_radius = GetIntAttr("depth_radius");
+ const float bias = GetFloatAttr("bias");
+ const float alpha = GetFloatAttr("alpha");
+ const float beta = GetFloatAttr("beta");
+
+ Eigen::Tensor<float, 4, Eigen::RowMajor> expected(batch_size, rows, cols,
+ depth);
+ auto out = expected.reshape(Eigen::DSizes<int64, 2>{rest, depth});
+ auto in = input.shaped<float, 2>({rest, depth});
+
+ for (int64 i = 0; i < rest; ++i) {
+ Eigen::Tensor<float, 1, Eigen::RowMajor> out_col(depth);
+ for (int64 d = 0; d < depth; ++d) {
+ float denom = 0.0f;
+ for (int64 r = std::max(0ll, d - depth_radius);
+ r < std::min(depth, d + depth_radius + 1); ++r) {
+ denom += in(i, r) * in(i, r);
+ }
+ denom = std::pow(denom * alpha + bias, beta);
+ out_col(d) = in(i, d) / denom;
+ }
+ out.chip<0>(i) = out_col;
+ }
+ auto actual = GetOutput(0)->tensor<float, 4>();
+ Eigen::Tensor<float, 0, Eigen::RowMajor> sum =
+ ((expected - actual).abs() > actual.constant(tol_))
+ .select(actual.constant(1), actual.constant(0))
+ .sum();
+ return sum() == 0;
+ }
+
+ random::PhiloxRandom philox_;
+ random::SimplePhilox rand_;
+};
+
+TEST_F(LRNFloatTest, Depth96) {
+ ASSERT_OK(NodeDefBuilder("lrn_op", "LRN")
+ .Input(FakeInput())
+ .Attr("depth_radius", 5)
+ .Attr("bias", 1.0f)
+ .Attr("alpha", 0.1f)
+ .Attr("beta", 2.0f)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ AddInput<float>(TensorShape({1, 1, 1, 96}),
+ [this](int i) -> float { return i + 1; });
+ ASSERT_OK(RunOpKernel());
+ auto actual = GetOutput(0)->tensor<float, 4>();
+
+ // Output for Node 0 with Value 1:
+ // 1 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2))^2
+ EXPECT_NEAR(1. / (10.1 * 10.1), actual(0, 0, 0, 0), tol_);
+
+ // Output for Node 5 with Value 6:
+ // 6 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2 ... + 11^2))^2
+ EXPECT_NEAR(6. / (51.6 * 51.6), actual(0, 0, 0, 5), tol_);
+
+ // Output for Node 63 with value 64:
+ // 64 / (1 + 0.1*(59^2 + 60^2 + 61^2 + 62^2 + 63^2 + 64^2))^2
+ EXPECT_NEAR(64. / (2272.1 * 2272.1), actual(0, 0, 0, 63), tol_);
+
+ // Output for Node 64 with value 65:
+ // 65 / (1 + 0.1*(65^2 + 66^2 + 67^2 + 68^2 + 69^2 + 70^2))^2
+ EXPECT_NEAR(65. / (2736.5 * 2736.5), actual(0, 0, 0, 64), tol_);
+
+ // Output for Node 95 with value 96:
+ // 96 / (1 + 0.1*(91^2 + 92^2 + 93^2 + 94^2 + 95^2 + 96^2))^2
+ EXPECT_NEAR(96. / (5248.1 * 5248.1), actual(0, 0, 0, 95), tol_);
+ EXPECT_TRUE(Compare());
+}
+
+TEST_F(LRNFloatTest, Depth16) {
+ ASSERT_OK(NodeDefBuilder("lrn_op", "LRN")
+ .Input(FakeInput())
+ .Attr("depth_radius", 5)
+ .Attr("bias", 1.0f)
+ .Attr("alpha", 0.1f)
+ .Attr("beta", 2.0f)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ AddInput<float>(TensorShape({1, 1, 1, 16}),
+ [this](int i) -> float { return i + 1; });
+ ASSERT_OK(RunOpKernel());
+ auto actual = GetOutput(0)->tensor<float, 4>();
+
+ // Output for Node 0 with Value 1:
+ // 1 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2))^2
+ EXPECT_NEAR(1. / (10.1 * 10.1), actual(0, 0, 0, 0), tol_);
+
+ // Output for Node 5 with Value 6:
+ // 6 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2 ... + 11^2))^2
+ EXPECT_NEAR(6. / (51.6 * 51.6), actual(0, 0, 0, 5), tol_);
+
+ // Output for Node 15 with value 16:
+ // 16 / (1 + 0.1*(11^2 + 12^2 + 13^2 + 14^2 + 15^2 + 16^2))^2
+ EXPECT_NEAR(16. / (112.1 * 112.1), actual(0, 0, 0, 15), tol_);
+ EXPECT_TRUE(Compare());
+}
+
+static double RndGaussian(random::SimplePhilox* rnd) {
+ // Box-Muller transformation.
+ // See, for example, http://www.taygeta.com/random/gaussian.html
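+ // Editorial note: this is the polar (Marsaglia) form, which produces two
+ // independent Gaussian variates, x1 * w and x2 * w; only the first is
+ // returned here.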
+ double x1, x2;
+ double r;
+ do {
+ x1 = 2 * rnd->RandDouble() - 1;
+ x2 = 2 * rnd->RandDouble() - 1;
+ r = x1 * x1 + x2 * x2;
+ } while (r == 0 || r >= 1.0);
+ double w = sqrt(-2.0 * log(r) / r);
+ return x1 * w;
+}
+
+#define TCASE(NAME, DEPTH, BATCH, DEPTH_RADIUS, BIAS, ALPHA, BETA) \
+ TEST_F(LRNFloatTest, NAME) { \
+ ASSERT_OK(NodeDefBuilder("lrn_op", "LRN") \
+ .Input(FakeInput()) \
+ .Attr("depth_radius", (DEPTH_RADIUS)) \
+ .Attr("bias", (BIAS)) \
+ .Attr("alpha", ((ALPHA) / 10)) \
+ .Attr("beta", (BETA)) \
+ .Finalize(node_def())); \
+ ASSERT_OK(InitOp()); \
+ AddInput<float>(TensorShape({BATCH, 1, 1, DEPTH}), \
+ [this](int i) -> float { return RndGaussian(&rand_); }); \
+ ASSERT_OK(RunOpKernel()); \
+ EXPECT_TRUE(Compare()); \
+ }
+
+// clang-format off
+// DEPTH BATCH DEPTH_RADIUS BIAS ALPHA BETA
+TCASE(T0, 4, 2, 2, 1.0f, 1.0f, 2.0f)
+TCASE(T1, 16, 1, 5, 1.0f, 1.0f, 2.0f)
+TCASE(T2, 16, 32, 2, 1.0f, 2.0f, 1.0f)
+TCASE(T3, 128, 4, 3, 2.0f, 1.0f, 1.0f)
+// clang-format on
+
+#undef TCASE
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
new file mode 100644
index 0000000000..08a4da5b41
--- /dev/null
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -0,0 +1,42 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/match.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class MatchingFilesOp : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+ void Compute(OpKernelContext* context) override {
+ const Tensor* pattern;
+ OP_REQUIRES_OK(context, context->input("pattern", &pattern));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(pattern->shape()),
+ errors::InvalidArgument(
+ "Input pattern tensor must be scalar, but had shape: ",
+ pattern->shape().DebugString()));
+ std::vector<string> fnames;
+ OP_REQUIRES_OK(context,
+ io::GetMatchingFiles(context->env(),
+ pattern->scalar<string>()(), &fnames));
+ const int num_out = fnames.size();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ "filenames", TensorShape({num_out}), &output));
+ auto output_vec = output->vec<string>();
+ for (int i = 0; i < num_out; ++i) {
+ output_vec(i) = fnames[i];
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MatchingFiles").Device(DEVICE_CPU),
+ MatchingFilesOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
new file mode 100644
index 0000000000..48bdba78b2
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -0,0 +1,214 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/matmul_op.h"
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+#if GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+#endif // GOOGLE_CUDA
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T, bool USE_CUBLAS>
+struct LaunchMatMul;
+
+// On CPUs, we ignore USE_CUBLAS
+template <typename T>
+struct LaunchMatMulCPU {
+ static void launch(
+ OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+ Tensor* out) {
+ functor::MatMulFunctor<CPUDevice, T>()(ctx->eigen_device<CPUDevice>(),
+ out->matrix<T>(), a.matrix<T>(),
+ b.matrix<T>(), dim_pair);
+ }
+};
+
+template <typename T, bool USE_CUBLAS>
+struct LaunchMatMul<CPUDevice, T, USE_CUBLAS> : public LaunchMatMulCPU<T> {};
+
+#if GOOGLE_CUDA
+
+template <typename T>
+struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
+ static void launch(
+ OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+ Tensor* out) {
+ perftools::gputools::blas::Transpose trans[] = {
+ perftools::gputools::blas::Transpose::kNoTranspose,
+ perftools::gputools::blas::Transpose::kTranspose};
+ const uint64 m = a.dim_size(1 - dim_pair[0].first);
+ const uint64 k = a.dim_size(dim_pair[0].first);
+ const uint64 n = b.dim_size(1 - dim_pair[0].second);
+ bool transpose_a = dim_pair[0].first == 0;
+ bool transpose_b = dim_pair[0].second == 1;
+ auto blas_transpose_a = trans[transpose_a];
+ auto blas_transpose_b = trans[transpose_b];
+
+ auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+ auto a_ptr = AsDeviceMemory(a.template flat<T>().data());
+ auto b_ptr = AsDeviceMemory(b.template flat<T>().data());
+ auto c_ptr = AsDeviceMemory(out->template flat<T>().data());
+
+ // Cublas does
+ // C = A x B
+ // where A, B and C are assumed to be in column major.
+ // We want the output to be in row-major, so we can compute
+ // C' = B' x A' (' stands for transpose)
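+ // For example (illustrative): with an untransposed a of shape [m, k] and b
+ // of shape [k, n], cuBLAS is asked for the n x m column-major product of
+ // b' (n x k) and a' (k x m), which is exactly the m x n row-major result we
+ // want in c_ptr.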
+ bool blas_launch_status =
+ stream->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
+ b_ptr, transpose_b ? k : n, a_ptr,
+ transpose_a ? m : k, 0.0f, &c_ptr, n)
+ .ok();
+ if (!blas_launch_status) {
+ ctx->SetStatus(errors::Internal(
+ "Blas SGEMM launch failed : a.shape=(", a.dim_size(0), ", ",
+ a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1),
+ "), m=", m, ", n=", n, ", k=", k));
+ }
+ }
+};
+
+template <typename T>
+struct LaunchMatMul<GPUDevice, T, false /* USE_CUBLAS */> {
+ static void launch(
+ OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+ Tensor* out) {
+ functor::MatMulFunctor<GPUDevice, T>()(ctx->eigen_device<GPUDevice>(),
+ out->matrix<T>(), a.matrix<T>(),
+ b.matrix<T>(), dim_pair);
+ }
+};
+
+#endif // GOOGLE_CUDA
+
+template <typename Device, typename T, bool USE_CUBLAS>
+class MatMulOp : public OpKernel {
+ public:
+ explicit MatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& a = ctx->input(0);
+ const Tensor& b = ctx->input(1);
+
+ // Check that the dimensions of the two matrices are valid.
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
+ errors::InvalidArgument("In[0] is not a matrix"));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
+ errors::InvalidArgument("In[1] is not a matrix"));
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0].first = transpose_a_ ? 0 : 1;
+ dim_pair[0].second = transpose_b_ ? 1 : 0;
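+ // For example (illustrative): with transpose_a = transpose_b = false this
+ // contracts a's dim 1 with b's dim 0, i.e. [m, k] x [k, n] -> [m, n];
+ // with transpose_a = true, a's dim 0 is contracted instead, so a is
+ // interpreted as [k, m].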
+
+ OP_REQUIRES(ctx,
+ a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
+ errors::InvalidArgument("Matrix size-incompatible: In[0]: ",
+ a.shape().DebugString(), ", In[1]: ",
+ b.shape().DebugString()));
+ int a_dim_remaining = 1 - dim_pair[0].first;
+ int b_dim_remaining = 1 - dim_pair[0].second;
+ TensorShape out_shape(
+ {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+ if (out->NumElements() == 0) {
+ // If a has shape [0, x] or b has shape [x, 0], the output shape
+ // is a 0-element matrix, so there is nothing to do.
+ return;
+ }
+
+ if (a.NumElements() == 0 || b.NumElements() == 0) {
+ // If a has shape [x, 0] and b has shape [0, y], the
+ // output shape is [x, y] where x and y are non-zero, so we fill
+ // the output with zeros.
+ functor::SetZeroFunctor<Device, T> f;
+ f(ctx->eigen_device<Device>(), out->flat<T>());
+ return;
+ }
+
+ LaunchMatMul<Device, T, USE_CUBLAS>::launch(ctx, this, a, b, dim_pair, out);
+ }
+
+ private:
+ bool transpose_a_;
+ bool transpose_b_;
+};
+
+namespace functor {
+
+// Partial specialization MatMulFunctor<Device=CPUDevice, T>.
+template <typename T>
+struct MatMulFunctor<CPUDevice, T> {
+ void operator()(
+ const CPUDevice& d, typename MatMulTypes<T>::out_type out,
+ typename MatMulTypes<T>::in_type in0,
+ typename MatMulTypes<T>::in_type in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ MatMul<CPUDevice>(d, out, in0, in1, dim_pair);
+ }
+};
+
+} // end namespace functor
+
+#define REGISTER_CPU(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("eigen"), \
+ MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)
+
+#define REGISTER_GPU(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ MatMulOp<GPUDevice, T, true /* cublas, true by default */>); \
+ REGISTER_KERNEL_BUILDER(Name("MatMul") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .Label("cublas"), \
+ MatMulOp<GPUDevice, T, true /* cublas */>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_GPU).TypeConstraint<T>("T").Label("eigen"), \
+ MatMulOp<GPUDevice, T, false /* cublas */>)
+
+REGISTER_CPU(float);
+REGISTER_CPU(double);
+REGISTER_CPU(int32);
+REGISTER_CPU(complex64);
+#if GOOGLE_CUDA
+REGISTER_GPU(float);
+// REGISTER_GPU(double);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
new file mode 100644
index 0000000000..f75b0ded1b
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -0,0 +1,40 @@
+#ifndef TENSORFLOW_KERNELS_MATMUL_OP_H_
+#define TENSORFLOW_KERNELS_MATMUL_OP_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Helpers to define tensor<T> needed by MatMul op.
+template <typename T>
+struct MatMulTypes {
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
+ out_type;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
+ Eigen::Aligned> in_type;
+};
+
+template <typename Device, typename In0, typename In1, typename Out,
+ typename DimPair>
+void MatMul(const Device& d, Out out, In0 in0, In1 in1,
+ const DimPair& dim_pair) {
+ out.device(d) = in0.contract(in1, dim_pair);
+}
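+
+// Illustrative sketch (editorial, not part of the original header): invoking
+// the helper for a plain row-major product, with `device`, `out`, `in0` and
+// `in1` assumed to be in scope with the types above:
+//
+//   Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dims;
+//   dims[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+//   MatMul<Eigen::ThreadPoolDevice>(device, out, in0, in1, dims);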
+
+template <typename Device, typename T>
+struct MatMulFunctor {
+ // Computes on device "d": out = in0 * in1, where * is matrix
+ // multiplication.
+ void operator()(
+ const Device& d, typename MatMulTypes<T>::out_type out,
+ typename MatMulTypes<T>::in_type in0,
+ typename MatMulTypes<T>::in_type in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_MATMUL_OP_H_
diff --git a/tensorflow/core/kernels/matmul_op_gpu.cu.cc b/tensorflow/core/kernels/matmul_op_gpu.cu.cc
new file mode 100644
index 0000000000..17107ce5df
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op_gpu.cu.cc
@@ -0,0 +1,32 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/matmul_op.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization MatMulFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct MatMulFunctor<GPUDevice, T> {
+ void operator()(
+ const GPUDevice& d, typename MatMulTypes<T>::out_type out,
+ typename MatMulTypes<T>::in_type in0,
+ typename MatMulTypes<T>::in_type in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ MatMul<GPUDevice>(d, To32Bit(out), To32Bit(in0), To32Bit(in1), dim_pair);
+ }
+};
+
+#define DEFINE(T) template struct MatMulFunctor<GPUDevice, T>;
+DEFINE(float);
+// DEFINE(double); // Does not compile 1/2015.
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/matmul_op_test.cc b/tensorflow/core/kernels/matmul_op_test.cc
new file mode 100644
index 0000000000..b2b8f3d905
--- /dev/null
+++ b/tensorflow/core/kernels/matmul_op_test.cc
@@ -0,0 +1,56 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+static Graph* Matmul(int m, int k, int n, bool transpose_a, bool transpose_b) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor in0(DT_FLOAT, transpose_a ? TensorShape({k, m}) : TensorShape({m, k}));
+ in0.flat<float>().setRandom();
+ Tensor in1(DT_FLOAT, transpose_b ? TensorShape({n, k}) : TensorShape({k, n}));
+ in1.flat<float>().setRandom();
+ test::graph::Matmul(g, test::graph::Constant(g, in0),
+ test::graph::Constant(g, in1), transpose_a, transpose_b);
+ return g;
+}
+
+#define BM_MatmulDev(M, K, N, TA, TB, DEVICE) \
+ static void BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \
+ int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
+ test::Benchmark(#DEVICE, Matmul(M, K, N, TA, TB)).Run(iters); \
+ } \
+ BENCHMARK(BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE);
+
+#define BM_Matmul(M, K, N, TA, TB) \
+ BM_MatmulDev(M, K, N, TA, TB, cpu); \
+ BM_MatmulDev(M, K, N, TA, TB, gpu);
+
+// Typical fully connected layers
+BM_Matmul(8, 512, 512, false, false);
+BM_Matmul(16, 512, 512, false, false);
+BM_Matmul(128, 512, 512, false, false);
+
+BM_Matmul(8, 1024, 1024, false, false);
+BM_Matmul(16, 1024, 1024, false, false);
+BM_Matmul(128, 1024, 1024, false, false);
+BM_Matmul(4096, 4096, 4096, false, false);
+
+// Backward for fully connected layers
+BM_Matmul(8, 1024, 1024, false, true);
+BM_Matmul(16, 1024, 1024, false, true);
+BM_Matmul(128, 1024, 1024, false, true);
+
+// Forward softmax with large output size
+BM_Matmul(8, 200, 10000, false, false);
+BM_Matmul(20, 200, 10000, false, false);
+BM_Matmul(20, 200, 20000, false, false);
+
+// Backward softmax with large output size
+BM_Matmul(8, 10000, 200, false, true);
+BM_Matmul(20, 10000, 200, false, true);
+BM_Matmul(20, 20000, 200, false, true);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc
new file mode 100644
index 0000000000..ad0948d6ef
--- /dev/null
+++ b/tensorflow/core/kernels/matrix_inverse_op.cc
@@ -0,0 +1,64 @@
+// See docs in ../ops/linalg_ops.cc.
+#include <cmath>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/Eigen/LU"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperationT>
+class MatrixInverseOp
+ : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> {
+ public:
+ explicit MatrixInverseOp(OpKernelConstruction* context)
+ : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {}
+ ~MatrixInverseOp() override {}
+
+ TensorShape GetOutputMatrixShape(
+ const TensorShape& input_matrix_shape) override {
+ return input_matrix_shape;
+ }
+
+ int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override {
+ const int64 rows = input_matrix_shape.dim_size(0);
+ if (rows > (1LL << 20)) {
+ // A big number to cap the cost in case of overflow.
+ return kint32max;
+ } else {
+ return rows * rows * rows;
+ }
+ }
+
+ using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap;
+ using
+ typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap;
+
+ void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input,
+ MatrixMap* output) override {
+ OP_REQUIRES(context, input.rows() == input.cols(),
+ errors::InvalidArgument("Input matrix must be square."));
+ if (input.rows() == 0) {
+ // By definition, an empty matrix's inverse is an empty matrix.
+ return;
+ }
+ Eigen::FullPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::RowMajor>> lu_decomposition(input);
+ OP_REQUIRES(context, lu_decomposition.isInvertible(),
+ errors::InvalidArgument("Input is not invertible."));
+ *output = lu_decomposition.inverse();
+ }
+};
+
+REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<float, false>), float);
+REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<double, false>), double);
+REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<float, true>), float);
+REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<double, true>),
+ double);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
new file mode 100644
index 0000000000..31046018c5
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -0,0 +1,554 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/maxpooling_op.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+const int kInvalidMaxPoolingIndex = -1;
+
+template <typename Device, typename T>
+struct SpatialMaxPoolWithArgMaxHelper {
+ static void Compute(Tensor* output, Tensor* output_arg_max,
+ const Tensor& tensor_in, const PoolParameters& params,
+ const Padding& padding) {
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenIndexMatrixMap;
+
+ ConstEigenMatrixMap in_mat(
+ tensor_in.flat<T>().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ EigenMatrixMap out_mat(
+ output->flat<T>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+ EigenIndexMatrixMap out_arg_max_mat(
+ output_arg_max->flat<int64>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+
+ // Initializes the output tensor with the lowest value of T and the argmax
+ // tensor with kInvalidMaxPoolingIndex.
+ output_arg_max->flat<int64>().setConstant(kInvalidMaxPoolingIndex);
+ output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest());
+
+ // The following code basically does the following:
+ // 1. Flattens the input and output tensors into two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // output_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened tensor_in_as_matrix,
+ // and updates the corresponding column(s) in output_as_matrix with the
+ // max value.
+ for (int b = 0; b < params.tensor_in_batch; ++b) {
+ for (int h = 0; h < params.tensor_in_rows; ++h) {
+ for (int w = 0; w < params.tensor_in_cols; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ const int hpad = h + params.pad_rows;
+ const int wpad = w + params.pad_cols;
+ const int h_start =
+ (hpad < params.window_rows)
+ ? 0
+ : (hpad - params.window_rows) / params.row_stride + 1;
+ const int h_end =
+ std::min(hpad / params.row_stride + 1, params.out_height);
+ const int w_start =
+ (wpad < params.window_cols)
+ ? 0
+ : (wpad - params.window_cols) / params.col_stride + 1;
+ const int w_end =
+ std::min(wpad / params.col_stride + 1, params.out_width);
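+ // Example (illustrative): with window_rows = 3, row_stride = 2 and
+ // pad_rows = 1, input row h = 4 gives hpad = 5, so h_start =
+ // (5 - 3) / 2 + 1 = 2 and h_end = min(5 / 2 + 1, out_height) = 3
+ // (assuming out_height >= 3); the row feeds only output row 2.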
+ // compute elementwise max
+ const int in_index =
+ (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ const int out_index =
+ (b * params.out_height + ph) * params.out_width + pw;
+ /// NOTE(zhengxq): not using the Eigen matrix operation for now.
+ /// May consider parallelizing the operations if needed.
+ for (int d = 0; d < params.depth; ++d) {
+ const T& input_ref = in_mat.coeffRef(d, in_index);
+ T& output_ref = out_mat.coeffRef(d, out_index);
+ int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
+ if (output_ref < input_ref ||
+ out_arg_max_ref == kInvalidMaxPoolingIndex) {
+ output_ref = input_ref;
+ int input_offset = in_index * params.depth + d;
+ out_arg_max_ref = input_offset;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU),
+ MaxPoolingOp<CPUDevice, float>);
+
+#if GOOGLE_CUDA
+// Forward declarations for the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \
+ const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, int window_rows, \
+ int window_cols, int row_stride, int col_stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+// Note(jiayq): Currently, the Caffe custom implementation is faster than the
+// default Eigen implementation so we are using the custom kernel as the
+// default. However, you can explicitly invoke the eigen version using
+// kernel_label_map.
+REGISTER_KERNEL_BUILDER(Name("MaxPool")
+ .Device(DEVICE_GPU)
+ .Label("eigen_tensor"),
+ MaxPoolingOp<Eigen::GpuDevice, float>);
+#endif // GOOGLE_CUDA
+
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+// - The original input tensor
+// - The original output tensor
+// - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MaxPoolingGradOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ OP_REQUIRES(
+ context, ksize_[3] == 1 && stride_[3] == 1,
+ errors::Unimplemented(
+ "MaxPoolingGrad is not yet supported on the depth dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+
+ TensorShape output_shape = tensor_in.shape();
+
+ Tensor tensor_out_dup;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::v(),
+ tensor_out.shape(), &tensor_out_dup));
+ Tensor tensor_out_arg_max;
+ OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
+ tensor_out.shape(),
+ &tensor_out_arg_max));
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ output->flat<T>().setZero();
+
+ SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>::Compute(
+ &tensor_out_dup, &tensor_out_arg_max, tensor_in, params, padding_);
+ auto out_backprop_flat = out_backprop.flat<T>();
+ auto input_backprop_flat = output->flat<T>();
+ auto out_arg_max_flat = tensor_out_arg_max.flat<int64>();
+ int num_total_outputs = out_backprop.flat<T>().size();
+ int num_total_inputs = input_backprop_flat.size();
+
+ for (int index = 0; index < num_total_outputs; ++index) {
+ int input_backprop_index = out_arg_max_flat(index);
+ // Although this check is in the inner loop, it is worth the cost: it
+ // guards against memory corruption from bad indices, and our benchmarks
+ // show the performance impact is quite small.
+ CHECK(input_backprop_index >= 0 &&
+ input_backprop_index < num_total_inputs)
+ << "Invalid input backprop index: " << input_backprop_index << ", "
+ << num_total_inputs;
+ input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU),
+ MaxPoolingGradOp<CPUDevice, float>);
+
+#ifdef GOOGLE_CUDA
+
+static void MaxPoolingBackwardCustomKernel(
+ OpKernelContext* context, const std::vector<int32>& size,
+ const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
+ const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
+ Tensor* output = nullptr;
+
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, tensor_in_shape, &output));
+
+ PoolParameters params{context, size, stride, padding, tensor_in_shape};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ MaxPoolBackwardNoMask(
+ tensor_in->flat<float>().data(), params.tensor_in_batch,
+ params.tensor_in_rows, params.tensor_in_cols, params.depth,
+ params.out_height, params.out_width, params.window_rows,
+ params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
+ params.pad_cols, out_backprop.flat<float>().data(),
+ output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>());
+}
+
+template <class T>
+class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
+ public:
+ typedef Eigen::GpuDevice Device;
+
+ explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+
+ use_dnn_ = CanUseCudnn();
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+                errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be 4-dimensional"));
+
+ TensorShape output_shape = tensor_in.shape();
+
+ if (use_dnn_) {
+ DnnPoolingGradOp<T>::Compute(
+ context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_,
+ stride_, padding_, &tensor_in, &tensor_out, out_backprop,
+ output_shape);
+ } else {
+ MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_,
+ &tensor_in, out_backprop, output_shape);
+ }
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ bool use_dnn_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU),
+ MaxPoolingGradOp<Eigen::GpuDevice, float>);
+
+#endif // GOOGLE_CUDA
+
+template <typename Device, typename T>
+struct LaunchMaxPoolingNoMask;
+
+template <typename Device, typename T>
+class MaxPoolingNoMaskOp : public OpKernel {
+ public:
+ explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.out_height,
+ params.out_width, params.depth});
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+ LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
+ output);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+template <typename Device, typename T>
+struct LaunchMaxPoolingWithArgmax;
+
+template <typename Device, typename T>
+class MaxPoolingWithArgmaxOp : public OpKernel {
+ public:
+ explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.out_height,
+ params.out_width, params.depth});
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+ Tensor* argmax = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
+
+ LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in,
+ output, argmax);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+template <typename Device, typename T>
+struct LaunchMaxPoolingGradWithArgmax;
+
+template <typename Device, typename T>
+class MaxPoolingGradWithArgmaxOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument(
+ "Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& grad_in = context->input(1);
+ const Tensor& argmax = context->input(2);
+
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
+ params.tensor_in_cols, params.depth});
+ Tensor* grad_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &grad_out));
+
+ LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in,
+ argmax, grad_out);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+#if GOOGLE_CUDA
+
+template <typename T>
+struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& input, Tensor* output) {
+ bool status = MaxPoolForwardWithOptionalArgmax(
+ input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
+ params.tensor_in_cols, params.depth, params.out_height,
+ params.out_width, params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
+ output->flat<T>().data(), nullptr, context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+ errors::Internal("Failed launching MaxPoolForwardNoMask"));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU),
+ MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
+
+template <typename T>
+struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& input, Tensor* output, Tensor* argmax) {
+ bool status = MaxPoolForwardWithOptionalArgmax(
+ input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
+ params.tensor_in_cols, params.depth, params.out_height,
+ params.out_width, params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
+ output->flat<T>().data(),
+ reinterpret_cast<int64*>(argmax->flat<int64>().data()),
+ context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+ errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int64>("Targmax"),
+ MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);
+
+template <typename T>
+struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& grad_in, const Tensor& argmax,
+ Tensor* grad_out) {
+ const int input_size = params.tensor_in_batch * params.tensor_in_rows *
+ params.tensor_in_cols * params.depth;
+ const int output_size = params.tensor_in_batch * params.out_height *
+ params.out_width * params.depth;
+ const int top_offset = params.out_height * params.out_width * params.depth;
+ const int bottom_offset =
+ params.tensor_in_rows * params.tensor_in_cols * params.depth;
+ bool status = MaxPoolBackwardWithArgmax(
+ output_size, input_size, grad_in.flat<T>().data(),
+ reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
+ bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int64>("Targmax"),
+ MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/maxpooling_op.h b/tensorflow/core/kernels/maxpooling_op.h
new file mode 100644
index 0000000000..a074174118
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op.h
@@ -0,0 +1,29 @@
+#ifndef TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
+#define TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
+// Functor definition for MaxPoolingOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct SpatialMaxPooling {
+ void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
+ typename TTypes<T, 4>::ConstTensor input, int window_rows,
+ int window_cols, int row_stride, int col_stride,
+ const Eigen::PaddingType& padding) {
+ // Because we swap the layout, we swap the row/cols as well
+ output.swap_layout().device(d) =
+ Eigen::SpatialMaxPooling(input.swap_layout(), window_cols, window_rows,
+ col_stride, row_stride, padding);
+ }
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
new file mode 100644
index 0000000000..65262eb54e
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -0,0 +1,261 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/maxpooling_op.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+
+namespace tensorflow {
+namespace {
+// This is Yangqing's custom kernel for the maxpooling operation. There are
+// three functions: MaxPoolForwardNCHW and MaxPoolForwardNHWC implement the
+// forward pass for the two storage orders, and MaxPoolBackward implements the
+// backward pass for both storage orders.
+// The parameters to the forward kernels are as follows:
+// nthreads: the number of threads, which is equal to the output size.
+// bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
+// height, width, pooled_height, pooled_width: the input and output sizes.
+// kernel_h, kernel_w: the kernel sizes.
+// stride_h, stride_w: the strides.
+// pad_t, pad_l: the padding values on the top and left side.
+// top_data: the maxpool output.
+// mask: the output mask of the same size as top_data. It is stored in
+// int form, keeping track of the flattened index of the input item that
+// produces the max output. If a nullptr is passed in for mask, no mask
+// will be produced.
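+//
+// For example (an illustration added here, not part of the original comment):
+// in the NHWC kernel below, the stored mask value is the within-image offset
+// (h * width + w) * channels + c of the winning input element; the backward
+// pass then adds a per-image offset of height * width * channels to recover
+// the absolute index.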
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+ i < (n); i += blockDim.x * gridDim.x)
+
+// To call the forward and backward functions, use e.g.:
+// const int kThreadsPerBlock = 1024;
+// const int output_size = batch * channels * pooled_height * pooled_width;
+// MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+// kThreadsPerBlock, 0, cuda_stream>>>(...);
+template <typename dtype>
+__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
+ const int channels, const int height,
+ const int width, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, dtype* top_data,
+ int64* mask) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+ int hstart = ph * stride_h - pad_t;
+ int wstart = pw * stride_w - pad_l;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ dtype maxval = -FLT_MAX;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * channels * height * width;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int idx = c * height * width + h * width + w;
+ if (bottom_data_n[idx] > maxval) {
+ maxidx = idx;
+ maxval = bottom_data_n[idx];
+ }
+ }
+ }
+ top_data[index] = maxval;
+ if (mask != nullptr) {
+ mask[index] = maxidx;
+ }
+ }
+}
+
+template <typename dtype>
+__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, dtype* top_data,
+ int64* mask) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int n = index;
+ int c = n % channels;
+ n /= channels;
+ int wstart = (n % pooled_width) * stride_w - pad_l;
+ n /= pooled_width;
+ int hstart = (n % pooled_height) * stride_h - pad_t;
+ n /= pooled_height;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ dtype maxval = -FLT_MAX;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * height * width * channels;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int idx = (h * width + w) * channels + c;
+ if (bottom_data_n[idx] > maxval) {
+ maxidx = idx;
+ maxval = bottom_data_n[idx];
+ }
+ }
+ }
+ top_data[index] = maxval;
+ if (mask != nullptr) {
+ mask[index] = maxidx;
+ }
+ }
+}
+
+template <typename dtype>
+__global__ void MaxPoolBackwardNoMaskNHWC(
+ const int nthreads, const dtype* bottom_data, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ const dtype* top_diff, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // First find out the index to the maximum, since we have no mask.
+ int n = index;
+ int c = n % channels;
+ n /= channels;
+ int wstart = (n % pooled_width) * stride_w - pad_l;
+ n /= pooled_width;
+ int hstart = (n % pooled_height) * stride_h - pad_t;
+ n /= pooled_height;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ dtype maxval = -FLT_MAX;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * height * width * channels;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int idx = (h * width + w) * channels + c;
+ if (bottom_data_n[idx] > maxval) {
+ maxidx = idx;
+ maxval = bottom_data_n[idx];
+ }
+ }
+ }
+
+    // Atomically accumulate the bottom diff. maxidx can still be -1
+    // (uninitialized) if all of the bottom_data values are NaN.
+ if (maxidx != -1) {
+ atomicAdd(bottom_diff + n * height * width * channels + maxidx,
+ top_diff[index]);
+ }
+ }
+}
+
+// The parameters to the kernels in the backward function are as follows:
+// nthreads: the number of threads, which is equal to the output size.
+// top_diff: the gradient of the output data, of size N*Hout*Wout*C (or
+// N*C*Hout*Wout). As we have stored the flattened index of the input
+// entries, the backward function is agnostic of the input storage order.
+// mask: the output mask of the same size as top_data. It is stored in
+// int form, keeping track of the flattened index of the input item that
+// produces the max output.
+// top_offset: the pre-computed per-image offset of the maxpool output. This
+// is equal to Hout*Wout*C. We choose to pre-compute this so we do not
+// need to compute it every time inside the kernel.
+// bottom_offset: the pre-computed per-image offset of the maxpool input.
+// This is equal to H*W*C.
+// bottom_diff: the gradient with respect to the input.
+// This function relies on atomicAdd to avoid race conditions. Also, before the
+// kernel is run, you will need to make sure that bottom_diff is filled with
+// zero first.
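+//
+// For example (illustrative numbers, not from the original comment): pooling a
+// 4x4 input with 3 channels down to a 2x2 output gives
+// top_offset = 2 * 2 * 3 = 12 and bottom_offset = 4 * 4 * 3 = 48.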
+template <typename dtype>
+__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
+ const int64* mask, const int top_offset,
+ const int bottom_offset, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int image_id = (index / top_offset);
+ atomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
+ top_diff[index]);
+ }
+}
+
+template <typename dtype>
+__global__ void SetZero(const int nthreads, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) { *(bottom_diff + index) = dtype(0); }
+}
+
+#undef CUDA_1D_KERNEL_LOOP
+} // namespace
+
+bool MaxPoolForwardWithOptionalArgmax(
+ const float* bottom_data, const int batch, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ float* top_data, int64* mask, const Eigen::GpuDevice& d) {
+ const int kThreadsPerBlock = 1024;
+ const int output_size = batch * channels * pooled_height * pooled_width;
+
+ MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(
+ output_size, bottom_data, height, width, channels, pooled_height,
+ pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ top_data, mask);
+ return d.ok();
+}
+
+bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t, const int pad_l,
+ const float* top_diff, float* bottom_diff,
+ const Eigen::GpuDevice& d) {
+ const int kThreadsPerBlock = 1024;
+ const int bottom_size = batch * channels * height * width;
+ const int top_size = batch * channels * pooled_height * pooled_width;
+
+ SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
+
+ MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
+ kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(
+ top_size, bottom_data, height, width, channels, pooled_height,
+ pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ top_diff, bottom_diff);
+ return d.ok();
+}
+
+bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
+ const float* top_diff, const int64* mask,
+ const int top_offset, const int bottom_offset,
+ float* bottom_diff, const Eigen::GpuDevice& d) {
+ const int kThreadsPerBlock = 1024;
+ SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
+ MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>(
+ output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+ return d.ok();
+}
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::SpatialMaxPooling<GPUDevice, T>;
+
+DEFINE_GPU_KERNELS(float)
+
+#undef DEFINE_GPU_KERNELS
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
new file mode 100644
index 0000000000..bfdac904cc
--- /dev/null
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -0,0 +1,42 @@
+#if !GOOGLE_CUDA
+#error This file must only be included when building with Cuda support
+#endif
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+
+namespace tensorflow {
+
+// Run the forward pass of max pooling, optionally writing the argmax indices to
+// the mask array, if it is not nullptr. If mask is passed in as nullptr, the
+// argmax indices are not written.
+bool MaxPoolForwardWithOptionalArgmax(
+ const float* bottom_data, const int batch, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ float* top_data, int64* mask, const Eigen::GpuDevice& d);
+
+bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
+ const float* top_diff, const int64* mask,
+ const int top_offset, const int bottom_offset,
+ float* bottom_diff, const Eigen::GpuDevice& d);
+
+bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t, const int pad_l,
+ const float* top_diff, float* bottom_diff,
+ const Eigen::GpuDevice& d);
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_
diff --git a/tensorflow/core/kernels/no_op.cc b/tensorflow/core/kernels/no_op.cc
new file mode 100644
index 0000000000..b4f9df81a6
--- /dev/null
+++ b/tensorflow/core/kernels/no_op.cc
@@ -0,0 +1,8 @@
+#include "tensorflow/core/kernels/no_op.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp);
+REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/no_op.h b/tensorflow/core/kernels/no_op.h
new file mode 100644
index 0000000000..a3bcbd7680
--- /dev/null
+++ b/tensorflow/core/kernels/no_op.h
@@ -0,0 +1,17 @@
+#ifndef TENSORFLOW_KERNELS_NO_OP_H_
+#define TENSORFLOW_KERNELS_NO_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class NoOp : public OpKernel {
+ public:
+ explicit NoOp(OpKernelConstruction* context) : OpKernel(context) {}
+ void Compute(OpKernelContext* context) override {}
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_NO_OP_H_
diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc
new file mode 100644
index 0000000000..7bea17b9e2
--- /dev/null
+++ b/tensorflow/core/kernels/ops_testutil.cc
@@ -0,0 +1,18 @@
+#include "tensorflow/core/kernels/ops_testutil.h"
+
+namespace tensorflow {
+namespace test {
+
+NodeDef Node(const string& name, const string& op,
+ const std::vector<string>& inputs) {
+ NodeDef def;
+ def.set_name(name);
+ def.set_op(op);
+ for (const string& s : inputs) {
+ def.add_input(s);
+ }
+ return def;
+}
+
+} // namespace test
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
new file mode 100644
index 0000000000..7a3405bf04
--- /dev/null
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -0,0 +1,191 @@
+#ifndef TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
+#define TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+namespace test {
+
+// Return a NodeDef with the specified name/op/inputs.
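+// For example (illustrative): test::Node("n", "NoOp", {}) builds a NodeDef
+// named "n" that runs the "NoOp" kernel registered in no_op.cc.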
+NodeDef Node(const string& name, const string& op,
+ const std::vector<string>& inputs);
+
+} // namespace test
+
+// A helper class for testing op kernels.
+//
+// This class will eventually be replaced / heavily modified
+// to use the BrainClient interface.
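+//
+// A minimal usage sketch (added for illustration; "MyOpTest" is a placeholder
+// name, and "NoOp" is the kernel registered in no_op.cc):
+//
+//   class MyOpTest : public OpsTestBase {};
+//
+//   TEST_F(MyOpTest, RunsKernel) {
+//     set_node_def(test::Node("n", "NoOp", {}));
+//     ASSERT_TRUE(InitOp().ok());
+//     ASSERT_TRUE(RunOpKernel().ok());
+//   }
+//
+// Ops that consume inputs or produce outputs would additionally call
+// AddInputFromArray() before RunOpKernel() and GetOutput() afterwards.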
+class OpsTestBase : public ::testing::Test {
+ public:
+ OpsTestBase() : device_type_(DEVICE_CPU) {
+ device_.reset(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+ CHECK(device_.get()) << "Could not create CPU device";
+ }
+
+ ~OpsTestBase() override {
+ gtl::STLDeleteElements(&tensors_);
+ context_.reset(nullptr);
+ }
+
+ void set_node_def(const NodeDef& node_def) { node_def_.CopyFrom(node_def); }
+
+ // Clients can manipulate the underlying NodeDef via this accessor.
+ NodeDef* node_def() { return &node_def_; }
+
+  // Creates the kernel for the operator described by node_def(), recording
+  // the kernel's expected input types.
+ //
+ // Returns the status of initialization.
+ Status InitOp() {
+ Status status;
+ kernel_ = CreateOpKernel(device_type_, device_.get(), allocator(),
+ node_def_, &status);
+ if (kernel_ != nullptr) input_types_ = kernel_->input_types();
+ return status;
+ }
+
+  // Adds a single input tensor with the given 'shape'. 'input_mapping' maps
+  // each flat index in [0, NumElements(shape)) to the value stored at that
+  // position.
+ //
+ // TODO(vrv): Replace with something like a BrainClient Feed.
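+  //
+  // For example (illustrative only):
+  //   AddInput<float>(TensorShape({2, 2}),
+  //                   [](int i) -> float { return i * i; });
+  // fills a 2x2 float input with {0, 1, 4, 9}.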
+ template <typename T>
+ void AddInput(const TensorShape& shape, std::function<T(int)> input_mapping) {
+ CHECK_GT(input_types_.size(), inputs_.size())
+ << "Adding more inputs than types; perhaps you need to call MakeOp";
+ bool is_ref = IsRefType(input_types_[inputs_.size()]);
+ Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
+ DataTypeToEnum<T>::v(), shape);
+ test::FillFn(input, input_mapping);
+ tensors_.push_back(input);
+ if (is_ref) {
+ CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]),
+ DataTypeToEnum<T>::v());
+ inputs_.push_back({&lock_for_refs_, input});
+ } else {
+ CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<T>::v());
+ inputs_.push_back({nullptr, input});
+ }
+ }
+
+  // Like AddInput, but takes an explicit gtl::ArraySlice of data.
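+  //
+  // For example (illustrative only):
+  //   AddInputFromArray<float>(TensorShape({2, 2}), {1.0f, 2.0f, 3.0f, 4.0f});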
+ template <typename T>
+ void AddInputFromArray(const TensorShape& shape,
+ const gtl::ArraySlice<T>& data) {
+ CHECK_GT(input_types_.size(), inputs_.size())
+ << "Adding more inputs than types; perhaps you need to call MakeOp";
+ bool is_ref = IsRefType(input_types_[inputs_.size()]);
+ Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
+ DataTypeToEnum<T>::v(), shape);
+ test::FillValues<T>(input, data);
+ tensors_.push_back(input);
+ if (is_ref) {
+ CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]),
+ DataTypeToEnum<T>::v());
+ inputs_.push_back({&lock_for_refs_, input});
+ } else {
+ CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<T>::v());
+ inputs_.push_back({nullptr, input});
+ }
+ }
+
+  // Runs the operation using the inputs added so far.
+ //
+ // Returns the context's status after running the operation.
+ Status RunOpKernel() {
+ OpKernelContext::Params params;
+ params.device = device_.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs_;
+ params.op_kernel = kernel_.get();
+ params.output_alloc_attr = [this, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host =
+ (kernel_->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ context_.reset(new OpKernelContext(params));
+ device_->Compute(kernel_.get(), context_.get());
+ return context_->status();
+ }
+
+ // Returns the tensor input for 'input_index'.
+ //
+ // REQUIRES: 0 <= input_index < context_->num_inputs()
+ const Tensor& GetInput(int input_index) const {
+ CHECK_LT(input_index, context_->num_inputs());
+ CHECK(!IsRefType(context_->input_dtype(input_index)));
+ return context_->input(input_index);
+ }
+
+ TensorValue mutable_input(int input_index) {
+ CHECK_LT(input_index, inputs_.size());
+ return inputs_[input_index];
+ }
+ // Returns the tensor output for 'output_index'.
+ //
+ // REQUIRES: 0 <= output_index < context_->num_outputs()
+ Tensor* GetOutput(int output_index) {
+ CHECK_LT(output_index, context_->num_outputs());
+ return context_->mutable_output(output_index);
+ }
+
+ Allocator* allocator() {
+ return device_->GetAllocator(AllocatorAttributes());
+ }
+
+ const DataTypeVector& output_types() const { return kernel_->output_types(); }
+
+ protected:
+ std::unique_ptr<Device> device_;
+
+ std::unique_ptr<OpKernel> kernel_;
+ NodeDef node_def_;
+ DataTypeVector input_types_;
+ DeviceType device_type_;
+
+ mutex lock_for_refs_; // Used as the Mutex for inputs added as refs
+
+ gtl::InlinedVector<TensorValue, 4> inputs_;
+ // Owns Tensors.
+ std::vector<Tensor*> tensors_;
+
+ std::unique_ptr<OpKernelContext> context_;
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(OpsTestBase);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
diff --git a/tensorflow/core/kernels/ops_util.cc b/tensorflow/core/kernels/ops_util.cc
new file mode 100644
index 0000000000..ca2925128e
--- /dev/null
+++ b/tensorflow/core/kernels/ops_util.cc
@@ -0,0 +1,113 @@
+#include <cmath>
+
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+void RequireDefaultOps() {
+// TODO(opensource): Use a more generic sounding preprocessor name than
+// GOOGLE_CUDA (maybe SUPPORT_CUDA?)
+#if GOOGLE_CUDA
+ void RequireGPUDevice();
+ RequireGPUDevice();
+#endif
+}
+
+Status Get2dOutputSize(const int in_height, const int in_width,
+ int filter_height, int filter_width, int row_stride,
+ int col_stride, Padding padding, int* new_height,
+ int* new_width, int* pad_rows, int* pad_cols) {
+ int pad_bottom_unused, pad_right_unused;
+ return Get2dOutputSizeVerbose(
+ in_height, in_width, filter_height, filter_width, row_stride, col_stride,
+ padding, new_height, new_width, pad_rows, &pad_bottom_unused, pad_cols,
+ &pad_right_unused);
+}
+
+Status Get2dOutputSizeVerbose(const int in_height, const int in_width,
+ int filter_height, int filter_width,
+ int row_stride, int col_stride, Padding padding,
+ int* new_height, int* new_width, int* pad_top,
+ int* pad_bottom, int* pad_left, int* pad_right) {
+ // Cannot have strides larger than the patch size.
+ if (row_stride > filter_height || col_stride > filter_width) {
+ return errors::InvalidArgument(
+ "stride must be less than or equal to kernel size");
+ }
+ switch (padding) {
+ case Padding::VALID:
+ *new_height = ceil((in_height - filter_height + 1.f) /
+ static_cast<float>(row_stride));
+ *new_width = ceil((in_width - filter_width + 1.f) /
+ static_cast<float>(col_stride));
+ *pad_top = 0;
+ *pad_bottom = 0;
+ *pad_left = 0;
+ *pad_right = 0;
+ break;
+ case Padding::SAME:
+ *new_height = ceil(in_height / static_cast<float>(row_stride));
+ *new_width = ceil(in_width / static_cast<float>(col_stride));
+ // Calculate padding for top/bottom/left/right, spilling any excess
+ // padding to bottom and right.
+ const int pad_needed_height =
+ (*new_height - 1) * row_stride + filter_height - in_height;
+ *pad_top = pad_needed_height / 2;
+ CHECK_GE(pad_needed_height, 0);
+ *pad_bottom = pad_needed_height - *pad_top;
+
+ const int pad_needed_width =
+ (*new_width - 1) * col_stride + filter_width - in_width;
+ *pad_left = pad_needed_width / 2;
+ CHECK_GE(pad_needed_width, 0);
+ *pad_right = pad_needed_width - *pad_left;
+ break;
+ }
+ if (*new_height < 0 || *new_width < 0) {
+ return errors::InvalidArgument("computed output size would be negative");
+ }
+ return Status::OK();
+}
+
+Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) {
+ switch (padding) {
+ case Padding::VALID:
+ return Eigen::PADDING_VALID;
+ case Padding::SAME:
+ return Eigen::PADDING_SAME;
+ }
+ return Eigen::PADDING_SAME; // Prevent compiler warning about missing return
+}
+
+Status GetBroadcastSize(const int index, const int in_size,
+ const int ksize, const int stride,
+ const int pad_size, int* bindex, int* bsize) {
+ // Cannot have strides larger than the patch size.
+ if (stride > ksize) {
+ return errors::InvalidArgument(
+ "stride must be less than or equal to kernel size");
+ }
+ // Cannot have index beyond the input size.
+ if (index * stride > in_size) {
+ return errors::InvalidArgument(
+ "index * stride must be less than or equal to input size");
+ }
+ *bindex = index * stride;
+ *bsize = ksize;
+ if (*bindex < pad_size) {
+ // If the current index is in the padding area, start broadcast from index
+ // 0 with broadcast size reduced by padding size.
+ *bsize = ksize + *bindex - pad_size;
+ *bindex = 0;
+ } else {
+ // Otherwise, start broadcast from current index reduced by padding size.
+ *bindex -= pad_size;
+ }
+ if (*bindex + ksize > in_size) {
+ *bsize = std::min((in_size - *bindex), ksize);
+ }
+ return Status::OK();
+}
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
new file mode 100644
index 0000000000..283338f8df
--- /dev/null
+++ b/tensorflow/core/kernels/ops_util.h
@@ -0,0 +1,180 @@
+#ifndef TENSORFLOW_KERNELS_OPS_UTIL_H_
+#define TENSORFLOW_KERNELS_OPS_UTIL_H_
+
+// This file contains utilities for various operations.
+
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// Call this function from a test if op kernels are not being
+// registered. This can happen if the test is linked in a shared
+// mode and has no direct references to any code from this directory.
+void RequireDefaultOps();
+
+// Get2dOutputSize(): Given an input tensor, kernel, stride and padding
+// type, the function computes the output and padding dimensions.
+//
+// Convolution layers take in an input tensor of shape (D, C, R, B), and
+// convolve it with a set of filters, which can also be presented as a
+// tensor (D, K, K, M), where M is the number of filters, K is the filter size,
+// and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images). It also takes in a few additional parameters:
+//
+// Stride (S): the stride with which we apply the filters. This is the offset
+// between locations where we apply the filters. A larger stride
+// means that the output will be spatially smaller.
+//
+// Padding (P): the padding we apply to the input tensor along the R and C
+// dimensions. This is usually used to make sure that the spatial dimensions
+// do not shrink when we progress with convolutions. Two types of padding are
+// often used:
+//   SAME: the pad value is computed so that the output will have size
+//         ceil(R/S) and ceil(C/S).
+//   VALID: no padding is carried out.
+// The padded area is zero-filled.
+//
+// The output dimensions for convolution and many other operations, when given
+// all the parameters above, are as follows:
+// - When Padding = SAME: the output size is (B, R', C', M), where
+// R' = ceil(float(R) / float(S))
+// C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function. The number of padded rows and columns
+// are computed as:
+// Pr = ((R' - 1) * S + K - R) / 2
+// Pc = ((C' - 1) * S + K - C) / 2
+// When the stride is 1, we have the simplified case
+// R'=R, C'=C, Pr=Pc=(K-1)/2.
+//   This is where SAME comes from - the output has the same size as the
+//   input.
+//
+// - When Padding = VALID: the output size is computed as
+// R' = ceil(float(R - K + 1) / float(S))
+// C' = ceil(float(C - K + 1) / float(S))
+//   and no padding is applied (Pr = Pc = 0).
+// When the stride is 1, we have the simplified case
+// R'=R-K+1, C'=C-K+1, Pr=0, Pc=0.
+//
+// For convolution, mathematically, the output value at location (b, r', c', m)
+// is the inner product of two vectors: the chunk of input at
+// (b, (r'*S-Pr) : (r'*S-Pr+K), (c'*S-Pc) : (c'*S-Pc+K), :),
+// and the filter at (m, :, :, :).
+//
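+// A worked example (added for illustration; these numbers are not in the
+// original comment): for R = C = 5, K = 3, S = 2,
+//   SAME:  R' = C' = ceil(5 / 2) = 3, Pr = Pc = ((3 - 1) * 2 + 3 - 5) / 2 = 1
+//   VALID: R' = C' = ceil((5 - 3 + 1) / 2) = 2, Pr = Pc = 0
+//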
+Status Get2dOutputSize(const int in_height, const int in_width,
+ int filter_height, int filter_width, int row_stride,
+ int col_stride, Padding padding, int* new_height,
+ int* new_width, int* pad_rows, int* pad_cols);
+
+// Returns the same output dimensions as in Get2dOutputSize, but returns verbose
+// padding dimensions (top/bottom/left/right). Any excess padding (caused by
+// an odd padding size value) is added to the 'pad_bottom' and 'pad_right'
+// dimensions.
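+// For example (illustrative; this mirrors the Get2dOutputSizeVerbose case in
+// ops_util_test.cc): a 3x3 input with a 2x2 filter, stride 2 and SAME padding
+// needs 1 pad row and 1 pad column in total, so pad_top = pad_left = 0 and
+// pad_bottom = pad_right = 1.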
+Status Get2dOutputSizeVerbose(const int in_height, const int in_width,
+ int filter_height, int filter_width,
+ int row_stride, int col_stride, Padding padding,
+ int* new_height, int* new_width, int* pad_top,
+ int* pad_bottom, int* pad_left, int* pad_right);
+
+// Calculates broadcast starting index and size. For SAME padding, additional
+// padding could be applied to the right, left, top and bottom. Depending on the
+// current index, input size, kernel size, stride, padding size, the starting
+// index and size for broadcast for that dimension are different from the
+// current index and kernel size.
+// This is mainly used by gradient algorithms for pooling operations.
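+// For example (illustrative; the numbers mirror a case in ops_util_test.cc):
+// index = 1, in_size = 3, ksize = 3, stride = 1, pad_size = 1 yields
+// bindex = 0 and bsize = 3.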
+Status GetBroadcastSize(const int index, const int in_size,
+ const int ksize, const int stride,
+ const int pad_size, int* bindex, int* bsize);
+
+// Converts Brain's Padding to Eigen's PaddingType.
+Eigen::PaddingType BrainPadding2EigenPadding(Padding padding);
+
+// Given a shape 's' of a tensor of type T, returns true iff the
+// number of bytes occupied by each dim-0 slice (i.e., &tensor(i + 1, ...) -
+// &tensor(i, ...)) is a multiple of EIGEN_MAX_ALIGN_BYTES.
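+//
+// For example (illustrative, assuming EIGEN_MAX_ALIGN_BYTES == 32): a float
+// tensor of shape {4, 8} has 8 * sizeof(float) = 32 bytes per dim-0 slice and
+// is aligned, while shape {4, 3} (12 bytes per slice) is not.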
+template <typename T>
+bool IsInnerDimsSizeAligned(const TensorShape& s) {
+ if (s.dims() == 0) return false;
+ const int64 dim0_size = s.dim_size(0);
+ if (dim0_size == 0) return false;
+ const int64 bytes_per_dim0 = (s.num_elements() / dim0_size) * sizeof(T);
+ return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0;
+}
+
+// Returns in 'col_data', image patches in storage order (height, width, depth)
+// extracted from image at 'input_data', which is required to be in storage
+// order (batch, height, width, depth).
+// Implementation written by Yangqing Jia (jiayq).
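+// For example (illustrative sizes, not from the original comment): a 3x3
+// image with depth 1, a 2x2 filter, stride 1 and no padding gives
+// height_col = width_col = 2, so 'col_data' must hold 2 * 2 * 2 * 2 * 1 = 16
+// values.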
+template <typename T>
+void Im2col(const T* input_data, const int depth, const int height,
+ const int width, const int filter_h, const int filter_w,
+ const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+ const int stride_h, const int stride_w, T* col_data) {
+ int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+ int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+
+ int h_pad = -pad_t;
+ for (int h = 0; h < height_col; ++h) {
+ int w_pad = -pad_l;
+ for (int w = 0; w < width_col; ++w) {
+ for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+ for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+ if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+ memcpy(col_data, input_data + (ih * width + iw) * depth,
+ sizeof(T) * depth);
+ } else {
+ // This should be simply padded with zero.
+ memset(col_data, 0, sizeof(T) * depth);
+ }
+ col_data += depth;
+ }
+ }
+ w_pad += stride_w;
+ }
+ h_pad += stride_h;
+ }
+}
+
+// Returns in 'im_data' the image in storage order (height, width, depth),
+// constructed by accumulating (summing) the patches in 'col_data', which is
+// required to be in storage order
+// (out_height * out_width, filter_height, filter_width, in_depth).
+// Implementation by Yangqing Jia (jiayq).
+template <typename T>
+void Col2im(const T* col_data, const int depth, const int height,
+ const int width, const int filter_h, const int filter_w,
+ const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+ const int stride_h, const int stride_w, T* im_data) {
+ memset(im_data, 0, sizeof(T) * height * width * depth);
+ int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+ int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+ int h_pad = -pad_t;
+ for (int h = 0; h < height_col; ++h) {
+ int w_pad = -pad_l;
+ for (int w = 0; w < width_col; ++w) {
+ T* im_patch_data = im_data + (h_pad * width + w_pad) * depth;
+ for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+ for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+ if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+ // TODO(andydavis) Vectorize this loop (if compiler does not).
+ for (int i = 0; i < depth; ++i) {
+ im_patch_data[i] += col_data[i];
+ }
+ }
+ im_patch_data += depth;
+ col_data += depth;
+ }
+        // Jump over the remaining (width - filter_w) columns of this image
+        // row, each holding 'depth' values.
+ im_patch_data += depth * (width - filter_w);
+ }
+ w_pad += stride_w;
+ }
+ h_pad += stride_h;
+ }
+}
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_OPS_UTIL_H_
diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc
new file mode 100644
index 0000000000..bc4f57e220
--- /dev/null
+++ b/tensorflow/core/kernels/ops_util_test.cc
@@ -0,0 +1,265 @@
+#include "tensorflow/core/kernels/ops_util.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class OpsUtilTest : public ::testing::Test {
+ protected:
+ OpsUtilTest() {}
+ ~OpsUtilTest() override {}
+
+ // Padding structure.
+ struct padding_struct {
+ // Input parameters.
+ struct {
+ int in_height;
+ int in_width;
+ int filter_height;
+ int filter_width;
+ int row_stride;
+ int col_stride;
+ Padding padding;
+ } input;
+ // Output.
+ struct {
+ int new_height;
+ int new_width;
+ int pad_top;
+ int pad_bottom;
+ int pad_left;
+ int pad_right;
+ } output;
+ };
+
+ // Broadcast structure.
+ struct bcast_struct {
+ // Input parameters.
+ struct {
+ int index; // Current index.
+ int in_size; // Size of the dimension.
+ int ksize; // Kernel size.
+ int stride; // Stride.
+ int pad_size; // Padding size.
+ } input;
+ // Output.
+ struct {
+ int new_index; // New starting index.
+ int new_size; // New broadcast size.
+ } output;
+ };
+
+ static void VerifyGet2dOutputSizeBoundaries(padding_struct pad_struct,
+ error::Code code) {
+ int new_height, new_width, pad_rows, pad_cols;
+ Status status = Get2dOutputSize(
+ pad_struct.input.in_height, pad_struct.input.in_width,
+ pad_struct.input.filter_height, pad_struct.input.filter_width,
+ pad_struct.input.row_stride, pad_struct.input.col_stride,
+ pad_struct.input.padding, &new_height, &new_width, &pad_rows,
+ &pad_cols);
+ EXPECT_EQ(status.code(), code) << status;
+ }
+
+ static void VerifyGet2dOutputSizeValues(padding_struct pad_struct,
+ error::Code code) {
+ int new_height, new_width, pad_rows, pad_cols;
+ Status status = Get2dOutputSize(
+ pad_struct.input.in_height, pad_struct.input.in_width,
+ pad_struct.input.filter_height, pad_struct.input.filter_width,
+ pad_struct.input.row_stride, pad_struct.input.col_stride,
+ pad_struct.input.padding, &new_height, &new_width, &pad_rows,
+ &pad_cols);
+ EXPECT_EQ(status.code(), code) << status;
+ EXPECT_EQ(pad_struct.output.new_height, new_height);
+ EXPECT_EQ(pad_struct.output.new_width, new_width);
+ EXPECT_EQ(pad_struct.output.pad_top, pad_rows);
+ EXPECT_EQ(pad_struct.output.pad_left, pad_cols);
+ }
+
+ static void VerifyGet2dOutputVerboseSizeValues(padding_struct pad_struct,
+ error::Code code) {
+ int new_height, new_width, pad_top, pad_bottom, pad_left, pad_right;
+ Status status = Get2dOutputSizeVerbose(
+ pad_struct.input.in_height, pad_struct.input.in_width,
+ pad_struct.input.filter_height, pad_struct.input.filter_width,
+ pad_struct.input.row_stride, pad_struct.input.col_stride,
+ pad_struct.input.padding, &new_height, &new_width, &pad_top,
+ &pad_bottom, &pad_left, &pad_right);
+ EXPECT_EQ(status.code(), code) << status;
+ EXPECT_EQ(pad_struct.output.new_height, new_height);
+ EXPECT_EQ(pad_struct.output.new_width, new_width);
+ EXPECT_EQ(pad_struct.output.pad_top, pad_top);
+ EXPECT_EQ(pad_struct.output.pad_bottom, pad_bottom);
+ EXPECT_EQ(pad_struct.output.pad_left, pad_left);
+ EXPECT_EQ(pad_struct.output.pad_right, pad_right);
+ }
+
+ static void VerifyBoundaries(bcast_struct bcast, error::Code code) {
+ int new_index, new_size;
+ Status status = GetBroadcastSize(
+ bcast.input.index, bcast.input.in_size, bcast.input.ksize,
+ bcast.input.stride, bcast.input.pad_size, &new_index, &new_size);
+ EXPECT_EQ(status.code(), code) << status;
+ }
+
+ static void VerifyBcastValues(bcast_struct bcast) {
+ int new_index, new_size;
+ EXPECT_EQ(Status::OK(),
+ GetBroadcastSize(bcast.input.index, bcast.input.in_size,
+ bcast.input.ksize, bcast.input.stride,
+ bcast.input.pad_size, &new_index, &new_size));
+ EXPECT_EQ(bcast.output.new_index, new_index);
+ EXPECT_EQ(bcast.output.new_size, new_size);
+ }
+};
+
+// Test stride > ksize fails with INVALID_ARGUMENT.
+TEST_F(OpsUtilTest, Get2dOutputSizeInvalidTest) {
+ padding_struct pad_struct = {{3, 3, 1, 2, 2, 2, SAME}, {3, 3, 1, 1, 1, 1}};
+ VerifyGet2dOutputSizeBoundaries(pad_struct, error::INVALID_ARGUMENT);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeNegativeSizeTest) {
+ padding_struct pad_struct = {{1, 1, 3, 3, 1, 1, VALID}, {-1, -1, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeBoundaries(pad_struct, error::INVALID_ARGUMENT);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeSquareFilterTest) {
+ padding_struct pad_struct1 = {{3, 3, 2, 2, 2, 2, SAME}, {2, 2, 0, 0, 0, 0}};
+ padding_struct pad_struct2 = {{3, 3, 2, 2, 2, 2, VALID}, {1, 1, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputSizeValues(pad_struct2, error::OK);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeNonSquareFilterTest) {
+ padding_struct pad_struct1 = {{4, 5, 1, 2, 1, 1, SAME}, {4, 5, 0, 0, 0, 0}};
+ padding_struct pad_struct2 = {{4, 5, 1, 2, 1, 1, VALID}, {4, 4, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputSizeValues(pad_struct2, error::OK);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeUnevenStrideTest) {
+ padding_struct pad_struct1 = {{4, 4, 2, 2, 1, 2, VALID}, {3, 2, 0, 0, 0, 0}};
+ padding_struct pad_struct2 = {{4, 4, 2, 2, 2, 1, VALID}, {2, 3, 0, 0, 0, 0}};
+ VerifyGet2dOutputSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputSizeValues(pad_struct2, error::OK);
+}
+
+TEST_F(OpsUtilTest, Get2dOutputSizeVerbose) {
+ padding_struct pad_struct1 = {{3, 3, 2, 2, 2, 2, SAME}, {2, 2, 0, 1, 0, 1}};
+ padding_struct pad_struct2 = {{3, 3, 2, 2, 2, 2, VALID}, {1, 1, 0, 0, 0, 0}};
+ VerifyGet2dOutputVerboseSizeValues(pad_struct1, error::OK);
+ VerifyGet2dOutputVerboseSizeValues(pad_struct2, error::OK);
+}
+
+// Test stride > ksize fails with INVALID_ARGUMENT.
+TEST_F(OpsUtilTest, GetBroadcastTest3_1_2_0) {
+ bcast_struct bcast = {{0, 3, 1, 2, 0}, {0, 3}};
+ VerifyBoundaries(bcast, error::INVALID_ARGUMENT);
+}
+
+// Test index * stride > in_size fails with INVALID_ARGUMENT.
+TEST_F(OpsUtilTest, GetBroadcastTestBadIndex) {
+ bcast_struct bcast = {{2, 3, 1, 2, 0}, {0, 3}};
+ VerifyBoundaries(bcast, error::INVALID_ARGUMENT);
+}
+
+// in_size = 3, ksize = 3, stride = 1, pad_size = 0
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_0) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 1, 0}, {0, 3}},
+ {{1, 3, 3, 1, 0}, {1, 2}},
+ {{2, 3, 3, 1, 0}, {2, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 1, pad_size = 1
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_1) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 1, 1}, {0, 2}},
+ {{1, 3, 3, 1, 1}, {0, 3}},
+ {{2, 3, 3, 1, 1}, {1, 2}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 1, pad_size = 2
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 1, 2}, {0, 1}},
+ {{1, 3, 3, 1, 2}, {0, 2}},
+ {{2, 3, 3, 1, 2}, {0, 3}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 2, pad_size = 0
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 2, pad_size = 1
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 2, pad_size = 2
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_2) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 2, 2}, {0, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 3, pad_size = 0
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 3, 0}, {0, 3}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 3, pad_size = 1
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+// in_size = 3, ksize = 3, stride = 3, pad_size = 2
+TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_2) {
+ bcast_struct bcast[] = {
+ {{0, 3, 3, 3, 2}, {0, 1}},
+ };
+ for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
+ VerifyBcastValues(bcast[i]);
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
new file mode 100644
index 0000000000..cb125ea2fe
--- /dev/null
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -0,0 +1,114 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/concat_op.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// --------------------------------------------------------------------------
+template <typename Device, typename T>
+class PackOp : public OpKernel {
+ public:
+ typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+ ConstMatrixVector;
+
+ explicit PackOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* c) override {
+ OpInputList values;
+ OP_REQUIRES_OK(c, c->input_list("values", &values));
+ const int num = values.size();
+
+ // Verify that all input shapes match
+ for (int i = 1; i < num; i++) {
+ OP_REQUIRES(c, values[0].shape().IsSameSize(values[i].shape()),
+ errors::InvalidArgument(
+ "Shapes of all inputs must match: values[0].shape = ",
+ values[0].shape().ShortDebugString(), " != values[", i,
+ "].shape = ", values[i].shape().ShortDebugString()));
+ }
+
+ TensorShape output_shape(values[0].shape());
+ output_shape.InsertDim(0, num);
+
+ // In the num = 1 case, just reshape the input
+ if (num == 1) {
+ Tensor output;
+ CHECK(output.CopyFrom(values[0], output_shape));
+ c->set_output(0, output);
+ return;
+ }
+
+ // Allocate output
+ Tensor* output;
+ OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+
+ const int output_size = output->NumElements();
+ if (output_size > 0) {
+ auto output_flat = output->shaped<T, 2>({1, output_size});
+
+ // Except for shapes, pack is a special case of concat, so we reuse the
+ // same computational kernels.
+ ConstMatrixVector inputs_flat;
+ inputs_flat.reserve(num);
+ for (int i = 0; i < num; ++i) {
+ inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+ values[i].shaped<T, 2>({1, values[i].NumElements()})));
+ }
+ if (std::is_same<Device, GPUDevice>::value) {
+ ConcatGPU<T>(c->eigen_gpu_device(), inputs_flat, &output_flat);
+ } else {
+ ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+ }
+ }
+ }
+};
+
+#define REGISTER_PACK(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Pack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ PackOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_PACK);
+REGISTER_PACK(quint8);
+REGISTER_PACK(qint8);
+REGISTER_PACK(qint32);
+REGISTER_PACK(bfloat16);
+
+#undef REGISTER_PACK
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Pack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ PackOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Pack")
+ .Device(DEVICE_GPU)
+ .HostMemory("values")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ PackOp<CPUDevice, int32>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
new file mode 100644
index 0000000000..6c66e54e3d
--- /dev/null
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -0,0 +1,159 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/pad_op.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class PadOp : public OpKernel {
+ public:
+ explicit PadOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& in0 = context->input(0);
+ const Tensor& in1 = context->input(1);
+ const int dims = in0.dims();
+ static const int kMinDims = 0;
+ static const int kMaxDims = 5;
+ OP_REQUIRES(context, kMinDims <= dims && dims <= kMaxDims,
+ errors::Unimplemented("inputs rank not in [", kMinDims, ",",
+ kMaxDims, "]: ", dims));
+ OP_REQUIRES(
+ context,
+ TensorShapeUtils::IsMatrix(in1.shape()) && in1.dim_size(1) == 2,
+ errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
+ in1.shape().DebugString()));
+ const int fixed_dims =
+ (kAllowLegacyScalars && dims == 0 && in1.dim_size(0) == 1) ? 1 : dims;
+ OP_REQUIRES(
+ context, fixed_dims == in1.dim_size(0),
+ errors::InvalidArgument(
+ "The first dimension of paddings must be the rank of inputs",
+ in1.shape().DebugString(), " ", in0.shape().DebugString()));
+
+ // Compute the shape of the output tensor, and allocate it.
+ TensorShape output_shape;
+ TTypes<int32>::ConstMatrix paddings = in1.matrix<int32>();
+ for (int d = 0; d < fixed_dims; ++d) {
+ const int32 before_d = paddings(d, 0); // Pad before existing elements.
+ const int32 after_d = paddings(d, 1);  // Pad after existing elements.
+ OP_REQUIRES(context, before_d >= 0 && after_d >= 0,
+ errors::InvalidArgument("Paddings must be non-negative: ",
+ before_d, " ", after_d));
+ const int size_d =
+ (kAllowLegacyScalars && d == in0.dims()) ? 1 : in0.dim_size(d);
+ output_shape.AddDim(before_d + size_d + after_d);
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ // Invoke the dims-specific implementation.
+ switch (fixed_dims) {
+ case 0:
+ Operate<0>(context, in0.tensor<T, 0>(), paddings, output);
+ break;
+ case 1:
+ // TODO(irving): Once Pad doesn't need a scalar special case,
+ // change flat to tensor. That is, once !kAllowLegacyScalars.
+ Operate<1>(context, in0.flat<T>(), paddings, output);
+ break;
+ case 2:
+ Operate<2>(context, in0.tensor<T, 2>(), paddings, output);
+ break;
+ case 3:
+ Operate<3>(context, in0.tensor<T, 3>(), paddings, output);
+ break;
+ case 4:
+ Operate<4>(context, in0.tensor<T, 4>(), paddings, output);
+ break;
+ case 5:
+ Operate<5>(context, in0.tensor<T, 5>(), paddings, output);
+ break;
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument("Only ranks up to 5 supported: ",
+ in0.shape().DebugString()));
+ }
+ }
+
+ private:
+ template <int Dims>
+ void Operate(OpKernelContext* context,
+ typename TTypes<T, Dims>::ConstTensor input,
+ TTypes<int32>::ConstMatrix paddings, Tensor* output) {
+ CHECK_EQ(Dims, paddings.dimension(0));
+ CHECK_EQ(2, paddings.dimension(1));
+ Eigen::array<std::pair<int32, int32>, Dims> paddings_array;
+ for (int i = 0; i < Dims; ++i) {
+ paddings_array[i] = std::make_pair(paddings(i, 0), paddings(i, 1));
+ }
+ functor::Pad<Device, T, Dims> functor;
+ functor(context->eigen_device<Device>(), output->tensor<T, Dims>(), input,
+ paddings_array);
+ }
+};
+
+#define REGISTER_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Pad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("paddings"), \
+ PadOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void Pad<GPUDevice, T, Dims>::operator()( \
+ const GPUDevice& d, typename TTypes<T, Dims>::Tensor output, \
+ typename TTypes<T, Dims>::ConstTensor input, \
+ Eigen::array<std::pair<int32, int32>, Dims> paddings); \
+ extern template struct Pad<GPUDevice, T, Dims>;
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 0); \
+ DECLARE_GPU_SPEC(T, 1); \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("Pad") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("paddings"), \
+ PadOp<GPUDevice, T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/pad_op.h b/tensorflow/core/kernels/pad_op.h
new file mode 100644
index 0000000000..c4f8a4abda
--- /dev/null
+++ b/tensorflow/core/kernels/pad_op.h
@@ -0,0 +1,27 @@
+#ifndef TENSORFLOW_KERNELS_PAD_OP_H_
+#define TENSORFLOW_KERNELS_PAD_OP_H_
+// Functor definition for PadOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by PadOp to do the computations.
+template <typename Device, typename T, int Dims>
+struct Pad {
+ // Pad "input" into "output", as specified by "paddings". See pad_op.cc for
+ // details.
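+ // Each paddings[d] = {before, after} adds that many elements before and
+ // after dimension d; Eigen's pad() fills the new elements with zero by
+ // default.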
+ void operator()(const Device& d, typename TTypes<T, Dims>::Tensor output,
+ typename TTypes<T, Dims>::ConstTensor input,
+ Eigen::array<std::pair<int32, int32>, Dims> paddings) {
+ output.device(d) = input.pad(paddings);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_PAD_OP_H_
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
new file mode 100644
index 0000000000..35a03a2cb2
--- /dev/null
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -0,0 +1,26 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/pad_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in pad_op.cc.
+#define DEFINE_GPU_SPECS(T) \
+ template struct functor::Pad<GPUDevice, T, 0>; \
+ template struct functor::Pad<GPUDevice, T, 1>; \
+ template struct functor::Pad<GPUDevice, T, 2>; \
+ template struct functor::Pad<GPUDevice, T, 3>; \
+ template struct functor::Pad<GPUDevice, T, 4>; \
+ template struct functor::Pad<GPUDevice, T, 5>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
new file mode 100644
index 0000000000..35e9bd75fa
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -0,0 +1,252 @@
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+PoolParameters::PoolParameters(OpKernelContext* context,
+ const std::vector<int32>& ksize,
+ const std::vector<int32>& stride,
+ Padding padding,
+ const TensorShape& tensor_in_shape) {
+ // For pooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in_shape.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+ depth = tensor_in_shape.dim_size(3);
+ tensor_in_cols = tensor_in_shape.dim_size(2);
+ tensor_in_rows = tensor_in_shape.dim_size(1);
+ tensor_in_batch = tensor_in_shape.dim_size(0);
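+ // ksize and stride are given in NHWC order: {batch, rows, cols, depth}.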
+ window_rows = ksize[1];
+ window_cols = ksize[2];
+ depth_window = ksize[3];
+ row_stride = stride[1];
+ col_stride = stride[2];
+ depth_stride = stride[3];
+
+ // We only support 2D pooling across width/height and depthwise
+ // pooling, not a combination.
+ OP_REQUIRES(context,
+ (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
+ errors::Unimplemented(
+ "MaxPooling supports exactly one of pooling across depth "
+ "or pooling across width/height."));
+
+ if (depth_window == 1) {
+ OP_REQUIRES_OK(context, Get2dOutputSize(
+ tensor_in_rows, tensor_in_cols, window_rows,
+ window_cols, row_stride, col_stride, padding,
+ &out_height, &out_width, &pad_rows, &pad_cols));
+ } else {
+ // Our current version of depthwise max pooling does not support
+ // any padding, and expects the depth_window to equal the
+ // depth_stride (no overlapping).
+ OP_REQUIRES(
+ context, depth % depth_window == 0,
+ errors::Unimplemented("Depthwise max pooling requires the depth "
+ "window to evenly divide the input depth"));
+ OP_REQUIRES(
+ context, depth_stride == depth_window,
+ errors::Unimplemented("Depthwise max pooling requires the depth "
+ "window to equal the depth stride"));
+
+ // The current version of depthwise max is only implemented on CPU.
+ OP_REQUIRES(context,
+ (DeviceType(static_cast<Device*>(context->device())
+ ->attributes()
+ .device_type()) == DeviceType(DEVICE_CPU)),
+ errors::Unimplemented("Depthwise max pooling is currently "
+ "only implemented for CPU devices."));
+
+ pad_depth = 0;
+ out_depth = depth / depth_window;
+ }
+}
+
+TensorShape PoolParameters::forward_output_shape() {
+ if (depth_window == 1) {
+ // Spatial pooling
+ return TensorShape({tensor_in_batch, out_height, out_width, depth});
+ } else {
+ // Depthwise pooling
+ return TensorShape(
+ {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
+ }
+}
+
+#ifdef GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+ uint64 size) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+ size * sizeof(T));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+}
+} // namespace
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void TransformDepth<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+ const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \
+ typename TTypes<T, 4>::Tensor out); \
+ extern template struct TransformDepth<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+template <typename T>
+void DnnPoolingGradOp<T>::Compute(
+ OpKernelContext* context,
+ perftools::gputools::dnn::PoolingMode pooling_mode,
+ const std::vector<int32>& size, const std::vector<int32>& stride,
+ Padding padding, const Tensor* tensor_in, const Tensor* tensor_out,
+ const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
+ CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+ (tensor_in && tensor_out))
+ << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
+ "specified";
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, tensor_in_shape, &output));
+
+ PoolParameters params{context, size, stride, padding, tensor_in_shape};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ /// For now, cudnn does not support the NHWC format, so we need to convert
+ /// the tensors to NCHW before calling cudnn. We can drop this conversion
+ /// once cudnn supports NHWC directly.
+ Tensor transformed_input;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({tensor_in_shape.dim_size(0),
+ tensor_in_shape.dim_size(3),
+ tensor_in_shape.dim_size(1),
+ tensor_in_shape.dim_size(2)}),
+ &transformed_input));
+ Tensor transformed_input_backprop;
+ OP_REQUIRES_OK(context, context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({tensor_in_shape.dim_size(0),
+ tensor_in_shape.dim_size(3),
+ tensor_in_shape.dim_size(1),
+ tensor_in_shape.dim_size(2)}),
+ &transformed_input_backprop));
+ Tensor transformed_output;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+ out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+ &transformed_output));
+ Tensor transformed_output_backprop;
+ OP_REQUIRES_OK(
+ context,
+ context->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+ out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+ &transformed_output_backprop));
+
+ auto nhwc_to_nchw = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2);
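+ // The (0, 3, 1, 2) shuffle moves the channel dimension from last (NHWC)
+ // to second (NCHW), matching the kBatchDepthYX layout used below.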
+ if (tensor_in) {
+ // For AvgPoolGrad, the original input tensor is not necessary. However,
+ // cudnn still requires it to be passed in, although it does not affect
+ // the results.
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(), tensor_in->tensor<T, 4>(),
+ nhwc_to_nchw, transformed_input.tensor<T, 4>());
+ }
+ if (tensor_out) {
+ // For AvgPoolGrad, the original output tensor is not necessary. However,
+ // cudnn still requires it to be passed in, although it does not affect
+ // the results.
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(), tensor_out->tensor<T, 4>(),
+ nhwc_to_nchw, transformed_output.tensor<T, 4>());
+ }
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+ nhwc_to_nchw, transformed_output_backprop.tensor<T, 4>());
+
+ /// Get ready to call cudnn
+ perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+ pooling_desc.set_pooling_mode(pooling_mode)
+ .set_window_height(params.window_rows)
+ .set_window_width(params.window_cols)
+ .set_vertical_stride(params.row_stride)
+ .set_horizontal_stride(params.col_stride)
+ .set_vertical_padding(params.pad_rows)
+ .set_horizontal_padding(params.pad_cols);
+
+ perftools::gputools::dnn::BatchDescriptor orig_output_desc;
+ orig_output_desc.set_count(params.tensor_in_batch)
+ .set_height(params.out_height)
+ .set_width(params.out_width)
+ .set_feature_map_count(params.depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+ perftools::gputools::dnn::BatchDescriptor orig_input_desc;
+ orig_input_desc.set_count(params.tensor_in_batch)
+ .set_height(params.tensor_in_rows)
+ .set_width(params.tensor_in_cols)
+ .set_feature_map_count(params.depth)
+ .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+ auto orig_output_data =
+ AsDeviceMemory(transformed_output.template flat<T>().data(),
+ transformed_output.template flat<T>().size());
+ auto orig_input_data =
+ AsDeviceMemory(transformed_input.template flat<T>().data(),
+ transformed_input.template flat<T>().size());
+ auto output_backprop =
+ AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
+ transformed_output_backprop.template flat<T>().size());
+ auto input_backprop =
+ AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
+ transformed_input_backprop.template flat<T>().size());
+
+ auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ bool status =
+ stream->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
+ orig_output_desc, orig_output_data,
+ output_backprop, &input_backprop)
+ .ok();
+ OP_REQUIRES(context, status,
+ errors::Internal("cudnn PoolBackward launch failed"));
+
+ /// Transform the output data from NCHW back to NHWC
+ auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+ auto nchw_to_nhwc = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1);
+ functor::TransformDepth<GPUDevice, T>()(
+ context->eigen_device<Device>(),
+ toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
+ nchw_to_nhwc, output->tensor<T, 4>());
+}
+
+template class DnnPoolingGradOp<float>;
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
new file mode 100644
index 0000000000..5bf44b6e40
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -0,0 +1,264 @@
+#ifndef TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/avgpooling_op.h"
+#include "tensorflow/core/kernels/maxpooling_op.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// A helper class to manage sizes and shapes for pooling operations.
+struct PoolParameters {
+ // Updates context->status if there is an invalid input.
+ PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ const TensorShape& tensor_in_shape);
+
+ // Returns the shape of the output for "forward" pooling operations.
+ TensorShape forward_output_shape();
+
+ int depth;
+
+ int tensor_in_cols;
+ int tensor_in_rows;
+ int tensor_in_batch;
+
+ int window_rows;
+ int window_cols;
+ int depth_window;
+
+ int row_stride;
+ int col_stride;
+ int depth_stride;
+
+ int out_height;
+ int out_width;
+ int out_depth;
+
+ int pad_rows;
+ int pad_cols;
+ int pad_depth;
+};
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MaxPoolingOp : public UnaryOp<T> {
+ public:
+ explicit MaxPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ PoolParameters params{context, ksize_, stride_, padding_,
+ tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, params.forward_output_shape(), &output));
+
+ if (params.depth_window > 1) {
+ DepthwiseMaxPool(context, output, tensor_in, params);
+ } else {
+ SpatialMaxPool(context, output, tensor_in, params, padding_);
+ }
+ }
+
+ private:
+ // Single-threaded implementation of DepthwiseMaxPool, which
+ // does not handle all of the same options as SpatialMaxPool
+ // (it relies on the no-padding and depth_stride == depth_window
+ // restrictions enforced by PoolParameters).
+ //
+ // TODO(vrv): implement a more general depthwise-max pool that works
+ // on GPU as well.
+ void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
+ const Tensor& tensor_in, const PoolParameters& params) {
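+ // View the input as a (depth_window x N/depth_window) matrix so that each
+ // column holds one pooling group along depth; the column-wise max then
+ // yields one output value per group.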
+ Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
+ tensor_in.NumElements() / params.depth_window);
+ Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
+ output->flat<T>().data(), 1, output->NumElements());
+ out_by_pool = in_by_pool.colwise().maxCoeff();
+ }
+
+ void SpatialMaxPool(OpKernelContext* context, Tensor* output,
+ const Tensor& tensor_in, const PoolParameters& params,
+ const Padding& padding) {
+ // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
+ // EigenMatrix version that is currently faster than Eigen's
+ // Spatial MaxPooling implementation.
+ //
+ // TODO(vrv): Remove this once we no longer need it.
+ if (std::is_same<Device, GPUDevice>::value) {
+ Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
+ functor::SpatialMaxPooling<Device, T>()(
+ context->eigen_device<Device>(), output->tensor<T, 4>(),
+ tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
+ params.row_stride, params.col_stride, pt);
+ } else {
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+
+ ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows *
+ params.tensor_in_batch);
+ EigenMatrixMap out_mat(
+ output->flat<T>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+
+ // Initializes the output tensor with MIN<T>.
+ output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest());
+
+ // The following code basically does the following:
+ // 1. Flattens the input and output tensors into two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // output_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened
+ // tensor_in_as_matrix,
+ // and updates the corresponding column(s) in output_as_matrix with the
+ // max value.
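+ //
+ // For each input column (b, h, w), the ranges [h_start, h_end) and
+ // [w_start, w_end) computed below index the output positions whose
+ // pooling window covers that input position.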
+ for (int b = 0; b < params.tensor_in_batch; ++b) {
+ for (int h = 0; h < params.tensor_in_rows; ++h) {
+ for (int w = 0; w < params.tensor_in_cols; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ const int hpad = h + params.pad_rows;
+ const int wpad = w + params.pad_cols;
+ const int h_start =
+ (hpad < params.window_rows)
+ ? 0
+ : (hpad - params.window_rows) / params.row_stride + 1;
+ const int h_end =
+ std::min(hpad / params.row_stride + 1, params.out_height);
+ const int w_start =
+ (wpad < params.window_cols)
+ ? 0
+ : (wpad - params.window_cols) / params.col_stride + 1;
+ const int w_end =
+ std::min(wpad / params.col_stride + 1, params.out_width);
+ // compute elementwise max
+ const int in_offset =
+ (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ const int out_offset =
+ (b * params.out_height + ph) * params.out_width + pw;
+ out_mat.col(out_offset) =
+ out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
+template <typename Device, typename T>
+void SpatialAvgPool(OpKernelContext* context, Tensor* output,
+ const Tensor& input, const PoolParameters& params,
+ const Padding& padding) {
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+
+ auto in_flat = input.flat<T>();
+ auto out_flat = output->flat<T>();
+
+ ConstEigenMatrixMap in_mat(
+ in_flat.data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ EigenMatrixMap out_mat(
+ out_flat.data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+ Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
+ out_count.setZero();
+
+ // Initializes output to zero.
+ out_flat.setZero();
+
+ // The following code basically does the following:
+ // 1. Flattens the input and output tensors into two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // output_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened
+ // tensor_in_as_matrix,
+ // and updates the corresponding column(s) in output_as_matrix with the
+ // average value.
+ for (int b = 0; b < params.tensor_in_batch; ++b) {
+ for (int h = 0; h < params.tensor_in_rows; ++h) {
+ for (int w = 0; w < params.tensor_in_cols; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ const int hpad = h + params.pad_rows;
+ const int wpad = w + params.pad_cols;
+ const int h_start =
+ (hpad < params.window_rows)
+ ? 0
+ : (hpad - params.window_rows) / params.row_stride + 1;
+ const int h_end =
+ std::min(hpad / params.row_stride + 1, params.out_height);
+ const int w_start =
+ (wpad < params.window_cols)
+ ? 0
+ : (wpad - params.window_cols) / params.col_stride + 1;
+ const int w_end =
+ std::min(wpad / params.col_stride + 1, params.out_width);
+ const int in_offset =
+ (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
+ Eigen::DSizes<ptrdiff_t, 2> in_indices(0, in_offset);
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ const int out_offset =
+ (b * params.out_height + ph) * params.out_width + pw;
+ out_mat.col(out_offset) += in_mat.col(in_offset);
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
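+ // Each output column now holds the sum over the input columns in its
+ // pooling window; dividing by out_count converts the sums into averages.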
+ DCHECK_GT(out_count.minCoeff(), 0);
+ out_mat.array().rowwise() /= out_count.transpose().array();
+}
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
new file mode 100644
index 0000000000..87a3ef5186
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -0,0 +1,39 @@
+#if !GOOGLE_CUDA
+#error This file must only be included when building with Cuda support
+#endif
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_
+
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/avgpooling_op.h"
+#include "tensorflow/core/kernels/maxpooling_op.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+// A helper class that launches the cudnn pooling backward operations.
+// The original input and output tensors are optional for AvgPoolGrad, but
+// mandatory for MaxPoolGrad.
+template <typename T>
+class DnnPoolingGradOp {
+ public:
+ typedef GPUDevice Device;
+ static void Compute(OpKernelContext* context,
+ perftools::gputools::dnn::PoolingMode pooling_mode,
+ const std::vector<int32>& size,
+ const std::vector<int32>& stride, Padding padding,
+ const Tensor* tensor_in, const Tensor* tensor_out,
+ const Tensor& out_backprop,
+ const TensorShape& tensor_in_shape);
+};
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
new file mode 100644
index 0000000000..1b13f68a3a
--- /dev/null
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -0,0 +1,153 @@
+#include "tensorflow/core/kernels/queue_base.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+namespace {
+
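+// Copies the index^th slice (along dimension 0) of 'parent' into 'element'.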
+template <DataType DT>
+void HandleSliceToElement(const Tensor& parent, Tensor* element, int index) {
+ typedef typename EnumToDataType<DT>::Type T;
+ auto parent_as_matrix = parent.flat_outer_dims<T>();
+ element->flat<T>() = parent_as_matrix.chip(index, 0);
+}
+
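+// Copies 'element' into the index^th slice (along dimension 0) of 'parent'.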
+template <DataType DT>
+void HandleElementToSlice(const Tensor& element, Tensor* parent, int index) {
+ typedef typename EnumToDataType<DT>::Type T;
+ auto parent_as_matrix = parent->flat_outer_dims<T>();
+ parent_as_matrix.chip(index, 0) = element.flat<T>();
+}
+
+} // namespace
+
+// static
+Status QueueBase::CopySliceToElement(const Tensor& parent, Tensor* element,
+ int index) {
+#define HANDLE_TYPE(DT) \
+ if (parent.dtype() == DT) { \
+ HandleSliceToElement<DT>(parent, element, index); \
+ return Status::OK(); \
+ }
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_UINT8);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT8);
+ HANDLE_TYPE(DT_STRING);
+ HANDLE_TYPE(DT_INT64);
+#undef HANDLE_TYPE
+ return errors::Unimplemented("Unhandled data type: ", parent.dtype());
+}
+
+// static
+Status QueueBase::CopyElementToSlice(const Tensor& element, Tensor* parent,
+ int index) {
+#define HANDLE_TYPE(DT) \
+ if (element.dtype() == DT) { \
+ HandleElementToSlice<DT>(element, parent, index); \
+ return Status::OK(); \
+ }
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_UINT8);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT8);
+ HANDLE_TYPE(DT_STRING);
+ HANDLE_TYPE(DT_INT64);
+#undef HANDLE_TYPE
+ return errors::Unimplemented("Unhandled data type: ", element.dtype());
+}
+
+QueueBase::QueueBase(const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name)
+ : component_dtypes_(component_dtypes),
+ component_shapes_(component_shapes),
+ name_(name) {}
+
+Status QueueBase::ValidateTupleCommon(const Tuple& tuple) const {
+ if (tuple.size() != static_cast<size_t>(num_components())) {
+ return errors::InvalidArgument(
+ "Wrong number of components in tuple. Expected ", num_components(),
+ ", got ", tuple.size());
+ }
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ if (tuple[i].dtype() != component_dtypes_[i]) {
+ return errors::InvalidArgument(
+ "Type mismatch in tuple component ", i, ". Expected ",
+ DataTypeString(component_dtypes_[i]), ", got ",
+ DataTypeString(tuple[i].dtype()));
+ }
+ }
+ return Status::OK();
+}
+
+// static
+string QueueBase::ShapeListString(const gtl::ArraySlice<TensorShape>& shapes) {
+ string result = "[";
+ bool first = true;
+ for (const TensorShape& shape : shapes) {
+ strings::StrAppend(&result, (first ? "" : ", "), shape.ShortDebugString());
+ first = false;
+ }
+ strings::StrAppend(&result, "]");
+ return result;
+}
+
+Status QueueBase::MatchesNodeDefOp(const NodeDef& node_def,
+ const string& op) const {
+ if (node_def.op() != op) {
+ return errors::InvalidArgument("Shared queue '", name_, "' has type '", op,
+ "' that does not match type of Node '",
+ node_def.name(), "': ", node_def.op());
+ }
+ return Status::OK();
+}
+
+Status QueueBase::MatchesNodeDefCapacity(const NodeDef& node_def,
+ int32 capacity) const {
+ int32 requested_capacity = -1;
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "capacity", &requested_capacity));
+ if (requested_capacity < 0) requested_capacity = kUnbounded;
+ if (requested_capacity != capacity) {
+ return errors::InvalidArgument("Shared queue '", name_, "' has capacity ",
+ capacity, " but requested capacity was ",
+ requested_capacity);
+ }
+ return Status::OK();
+}
+
+Status QueueBase::MatchesNodeDefTypes(const NodeDef& node_def) const {
+ DataTypeVector requested_dtypes;
+ TF_RETURN_IF_ERROR(
+ GetNodeAttr(node_def, "component_types", &requested_dtypes));
+ if (requested_dtypes != component_dtypes_) {
+ return errors::InvalidArgument("Shared queue '", name_,
+ "' has component types ",
+ DataTypeSliceString(component_dtypes_),
+ " but requested component types were ",
+ DataTypeSliceString(requested_dtypes));
+ }
+ return Status::OK();
+}
+
+Status QueueBase::MatchesNodeDefShapes(const NodeDef& node_def) const {
+ std::vector<TensorShape> requested_shapes;
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "shapes", &requested_shapes));
+ if (requested_shapes != component_shapes_) {
+ return errors::InvalidArgument("Shared queue '", name_,
+ "' has component shapes ",
+ ShapeListString(component_shapes_),
+ " but requested component shapes were ",
+ ShapeListString(requested_shapes));
+ }
+ return Status::OK();
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
new file mode 100644
index 0000000000..4897102974
--- /dev/null
+++ b/tensorflow/core/kernels/queue_base.h
@@ -0,0 +1,77 @@
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+// Functionality common to QueueInterface implementations.
+class QueueBase : public QueueInterface {
+ public:
+ // Used as a possible value of 'capacity', meaning the queue is unbounded.
+ static const int32 kUnbounded = INT_MAX;
+
+ // Args:
+ // component_dtypes: The types of each component in a queue-element tuple.
+ // component_shapes: The shapes of each component in a queue-element tuple,
+ // which must either be empty (if the shapes are not specified) or
+ // have the same size as component_dtypes.
+ // name: A name to use for the queue.
+ QueueBase(const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name);
+
+ // Implementations of QueueInterface methods --------------------------------
+ const DataTypeVector& component_dtypes() const override {
+ return component_dtypes_;
+ }
+
+ // Other public methods -----------------------------------------------------
+ const std::vector<TensorShape>& component_shapes() const {
+ return component_shapes_;
+ }
+
+ protected:
+ // Returns the number of components in a queue-element tuple.
+ int32 num_components() const { return component_dtypes_.size(); }
+
+ // True if shapes were specified. If so, inputs will be validated
+ // against them, etc.
+ bool specified_shapes() const { return component_shapes_.size() > 0; }
+
+ // Code common to Validate*Tuple().
+ Status ValidateTupleCommon(const Tuple& tuple) const;
+
+ // Copies the index^th slice (in the first dimension) of parent into element.
+ static Status CopySliceToElement(const Tensor& parent, Tensor* element,
+ int index);
+
+ // Copies element into the index^th slice (in the first dimension) of parent.
+ static Status CopyElementToSlice(const Tensor& element, Tensor* parent,
+ int index);
+
+ ~QueueBase() override {}
+
+ // Helpers for implementing MatchesNodeDef().
+ static string ShapeListString(const gtl::ArraySlice<TensorShape>& shapes);
+ Status MatchesNodeDefOp(const NodeDef& node_def, const string& op) const;
+ Status MatchesNodeDefCapacity(const NodeDef& node_def, int32 capacity) const;
+ Status MatchesNodeDefTypes(const NodeDef& node_def) const;
+ Status MatchesNodeDefShapes(const NodeDef& node_def) const;
+
+ const DataTypeVector component_dtypes_;
+ const std::vector<TensorShape> component_shapes_;
+ const string name_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(QueueBase);
+};
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
new file mode 100644
index 0000000000..c70dc76777
--- /dev/null
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -0,0 +1,288 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class QueueOpKernel : public AsyncOpKernel {
+ public:
+ explicit QueueOpKernel(OpKernelConstruction* context)
+ : AsyncOpKernel(context) {}
+
+ void ComputeAsync(OpKernelContext* ctx, DoneCallback callback) final {
+ QueueInterface* queue;
+ OP_REQUIRES_OK_ASYNC(ctx, GetResourceFromContext(ctx, "handle", &queue),
+ callback);
+ ComputeAsync(ctx, queue, [callback, queue]() {
+ queue->Unref();
+ callback();
+ });
+ }
+
+ protected:
+ virtual void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) = 0;
+};
+
+class QueueAccessOpKernel : public QueueOpKernel {
+ public:
+ explicit QueueAccessOpKernel(OpKernelConstruction* context)
+ : QueueOpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("timeout_ms", &timeout_));
+ // TODO(keveman): Enable timeout.
+ OP_REQUIRES(context, timeout_ == -1,
+ errors::InvalidArgument("Timeout not supported yet."));
+ }
+
+ protected:
+ int64 timeout_;
+};
+
+// Defines an EnqueueOp, the execution of which enqueues a tuple of
+// tensors in the given Queue.
+//
+// The op has 1 + k inputs, where k is the number of components in the
+// tuples stored in the given Queue:
+// - Input 0: queue handle.
+// - Input 1: 0th element of the tuple.
+// - ...
+// - Input k: (k-1)th element of the tuple.
+class EnqueueOp : public QueueAccessOpKernel {
+ public:
+ explicit EnqueueOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ DataTypeVector expected_inputs = {DT_STRING_REF};
+ for (DataType dt : queue->component_dtypes()) {
+ expected_inputs.push_back(dt);
+ }
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature(expected_inputs, {}),
+ callback);
+
+ QueueInterface::Tuple tuple;
+ OpInputList components;
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("components", &components),
+ callback);
+ for (const Tensor& Tcomponent : components) {
+ tuple.push_back(Tcomponent);
+ }
+
+ OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateTuple(tuple), callback);
+ queue->TryEnqueue(tuple, ctx, callback);
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(EnqueueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueEnqueue").Device(DEVICE_CPU), EnqueueOp);
+
+// Defines an EnqueueManyOp, the execution of which slices each
+// component of a tuple of tensors along the 0th dimension, and
+// enqueues tuples of slices in the given Queue.
+//
+// The op has 1 + k inputs, where k is the number of components in the
+// tuples stored in the given Queue:
+// - Input 0: queue handle.
+// - Input 1: 0th element of the tuple.
+// - ...
+// - Input k: (k-1)th element of the tuple.
+//
+// N.B. All tuple components must have the same size in the 0th
+// dimension.
+class EnqueueManyOp : public QueueAccessOpKernel {
+ public:
+ explicit EnqueueManyOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ DataTypeVector expected_inputs = {DT_STRING_REF};
+ for (DataType dt : queue->component_dtypes()) {
+ expected_inputs.push_back(dt);
+ }
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature(expected_inputs, {}),
+ callback);
+
+ QueueInterface::Tuple tuple;
+ OpInputList components;
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("components", &components),
+ callback);
+ for (const Tensor& Tcomponent : components) {
+ tuple.push_back(Tcomponent);
+ }
+
+ OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateManyTuple(tuple), callback);
+ queue->TryEnqueueMany(tuple, ctx, callback);
+ }
+
+ ~EnqueueManyOp() override {}
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(EnqueueManyOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueEnqueueMany").Device(DEVICE_CPU),
+ EnqueueManyOp);
+
+// Defines a DequeueOp, the execution of which dequeues a tuple of
+// tensors from the given Queue.
+//
+// The op has one input, which is the handle of the appropriate
+// Queue. The op has k outputs, where k is the number of components in
+// the tuples stored in the given Queue, and output i is the ith
+// component of the dequeued tuple.
+class DequeueOp : public QueueAccessOpKernel {
+ public:
+ explicit DequeueOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->MatchSignature({DT_STRING_REF}, queue->component_dtypes()),
+ callback);
+
+ queue->TryDequeue(ctx, [ctx, callback](const QueueInterface::Tuple& tuple) {
+ if (!ctx->status().ok()) {
+ callback();
+ return;
+ }
+ OpOutputList output_components;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->output_list("components", &output_components), callback);
+ for (int i = 0; i < ctx->num_outputs(); ++i) {
+ output_components.set(i, tuple[i]);
+ }
+ callback();
+ });
+ }
+
+ ~DequeueOp() override {}
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(DequeueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueDequeue").Device(DEVICE_CPU), DequeueOp);
+
+// Defines a DequeueManyOp, the execution of which concatenates the
+// requested number of elements from the given Queue along the 0th
+// dimension, and emits the result as a single tuple of tensors.
+//
+// The op has two inputs:
+// - Input 0: the handle to a queue.
+// - Input 1: the number of elements to dequeue.
+//
+// The op has k outputs, where k is the number of components in the
+// tuples stored in the given Queue, and output i is the ith component
+// of the dequeued tuple.
+class DequeueManyOp : public QueueAccessOpKernel {
+ public:
+ explicit DequeueManyOp(OpKernelConstruction* context)
+ : QueueAccessOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ const Tensor& Tnum_elements = ctx->input(1);
+ int32 num_elements = Tnum_elements.flat<int32>()(0);
+
+ OP_REQUIRES_ASYNC(
+ ctx, num_elements >= 0,
+ errors::InvalidArgument("DequeueManyOp must request a positive number "
+ "of elements"),
+ callback);
+
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
+ queue->component_dtypes()),
+ callback);
+
+ queue->TryDequeueMany(
+ num_elements, ctx, [ctx, callback](const QueueInterface::Tuple& tuple) {
+ if (!ctx->status().ok()) {
+ callback();
+ return;
+ }
+ OpOutputList output_components;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->output_list("components", &output_components),
+ callback);
+ for (int i = 0; i < ctx->num_outputs(); ++i) {
+ output_components.set(i, tuple[i]);
+ }
+ callback();
+ });
+ }
+
+ ~DequeueManyOp() override {}
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(DequeueManyOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueDequeueMany").Device(DEVICE_CPU),
+ DequeueManyOp);
+
+// Defines a QueueCloseOp, which closes the given Queue. Closing a
+// Queue signals that no more elements will be enqueued in it.
+//
+// The op has one input, which is the handle of the appropriate Queue.
+class QueueCloseOp : public QueueOpKernel {
+ public:
+ explicit QueueCloseOp(OpKernelConstruction* context)
+ : QueueOpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("cancel_pending_enqueues",
+ &cancel_pending_enqueues_));
+ }
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ queue->Close(ctx, cancel_pending_enqueues_, callback);
+ }
+
+ private:
+ bool cancel_pending_enqueues_;
+ TF_DISALLOW_COPY_AND_ASSIGN(QueueCloseOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueClose").Device(DEVICE_CPU), QueueCloseOp);
+
+// Defines a QueueSizeOp, which computes the number of elements in the
+// given Queue, and emits it as an output tensor.
+//
+// The op has one input, which is the handle of the appropriate Queue;
+// and one output, which is a single-element tensor containing the current
+// size of that Queue.
+class QueueSizeOp : public QueueOpKernel {
+ public:
+ explicit QueueSizeOp(OpKernelConstruction* context)
+ : QueueOpKernel(context) {}
+
+ protected:
+ void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+ DoneCallback callback) override {
+ Tensor* Tqueue_size = nullptr;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_size), callback);
+ Tqueue_size->flat<int32>().setConstant(queue->size());
+ callback();
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(QueueSizeOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueSize").Device(DEVICE_CPU), QueueSizeOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc
new file mode 100644
index 0000000000..4fc12e92cb
--- /dev/null
+++ b/tensorflow/core/kernels/random_crop_op.cc
@@ -0,0 +1,103 @@
+// See docs in ../ops/image_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+
+namespace tensorflow {
+
+template <typename T>
+class RandomCropOp : public OpKernel {
+ public:
+ explicit RandomCropOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, generator_.Init(context));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 3,
+ errors::InvalidArgument("input must be 3-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto shape_vec = shape_t.vec<int64>();
+ const int32 target_height = shape_vec(0);
+ const int32 target_width = shape_vec(1);
+
+ const int32 height = input.dim_size(0);
+ const int32 width = input.dim_size(1);
+ const int32 channels = input.dim_size(2);
+
+ // Allocate the output tensor with shape
+ // [target_height, target_width, channels].
+ Tensor* output = nullptr;
+ const auto output_shape =
+ TensorShape({target_height, target_width, channels});
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+ // If the target size matches the actual size, then do nothing.
+ if ((target_height == height) && (target_width == width)) {
+ *output = context->input(0);
+ }
+
+ // TODO(shlens): Implement the edge case where the target dimensions are
+ // larger than the image by zero-padding the image, so that the output is
+ // *always* [target_height, target_width] in size. For now such inputs are
+ // rejected below.
+ OP_REQUIRES(context, width >= target_width, errors::FailedPrecondition(
+ "width must be >= target_width: width = ", width,
+ ", target_width = ", target_width));
+ OP_REQUIRES(context, height >= target_height, errors::FailedPrecondition(
+ "height must be >= target_height: height = ", height,
+ ", target_height = ", target_height));
+
+ int32 offset_height = 0;
+ int32 offset_width = 0;
+
+ auto local_gen = generator_.ReserveSamples32(2);
+ random::SimplePhilox random(&local_gen);
+
+ if (width > target_width) {
+ offset_width = random.Rand32() % (width - target_width + 1);
+ }
+ if (height > target_height) {
+ offset_height = random.Rand32() % (height - target_height + 1);
+ }
+
+ // TODO(shlens): Do this more efficiently with memcpy once padding is
+ // available for smaller images.
+ typename TTypes<T, 3>::ConstTensor input_data = input.tensor<T, 3>();
+ typename TTypes<T, 3>::Tensor output_data = output->tensor<T, 3>();
+
+ for (int y = 0; y < target_height; ++y) {
+ for (int x = 0; x < target_width; ++x) {
+ for (int c = 0; c < channels; ++c) {
+ output_data(y, x, c) =
+ input_data(y + offset_height, x + offset_width, c);
+ }
+ }
+ }
+ }
+
+ private:
+ GuardedPhiloxRandom generator_;
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomCrop").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ RandomCropOp<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_crop_op_test.cc b/tensorflow/core/kernels/random_crop_op_test.cc
new file mode 100644
index 0000000000..1f232f4969
--- /dev/null
+++ b/tensorflow/core/kernels/random_crop_op_test.cc
@@ -0,0 +1,60 @@
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+class RandomCropOpTest : public OpsTestBase {
+ protected:
+ RandomCropOpTest() {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("random_crop_op", "RandomCrop")
+ .Input(FakeInput(DT_UINT8))
+ .Input(FakeInput(DT_INT64))
+ .Attr("T", DT_UINT8)
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(RandomCropOpTest, Basic) {
+ AddInputFromArray<uint8>(TensorShape({1, 2, 1}), {2, 2});
+ AddInputFromArray<int64>(TensorShape({2}), {1, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_UINT8, TensorShape({1, 1, 1}));
+ test::FillValues<uint8>(&expected, {2});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+TEST_F(RandomCropOpTest, SameSizeOneChannel) {
+ AddInputFromArray<uint8>(TensorShape({2, 1, 1}), {1, 2});
+ AddInputFromArray<int64>(TensorShape({2}), {2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_UINT8, TensorShape({2, 1, 1}));
+ test::FillValues<uint8>(&expected, {1, 2});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+TEST_F(RandomCropOpTest, SameSizeMultiChannel) {
+ AddInputFromArray<uint8>(TensorShape({2, 1, 3}), {1, 2, 3, 4, 5, 6});
+ AddInputFromArray<int64>(TensorShape({2}), {2, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_UINT8, TensorShape({2, 1, 3}));
+ test::FillValues<uint8>(&expected, {1, 2, 3, 4, 5, 6});
+ test::ExpectTensorEqual<uint8>(expected, *GetOutput(0));
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
new file mode 100644
index 0000000000..09b66d30e6
--- /dev/null
+++ b/tensorflow/core/kernels/random_op.cc
@@ -0,0 +1,276 @@
+// See docs in ../ops/random_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/random_op.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/hash/crc32c.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+// The default implementation of the functor, which should never be invoked.
+// But we still need to provide an implementation for now for the linker to
+// work, since we do not support all the distributions yet.
+template <typename Device, class Distribution>
+struct FillPhiloxRandom {
+ typedef typename Distribution::ResultElementType T;
+ void operator()(OpKernelContext*, const Device&, random::PhiloxRandom gen,
+ T* data, int64 size) {
+ LOG(FATAL) << "Default FillPhiloxRandom should not be executed.";
+ }
+};
+
+#if GOOGLE_CUDA
+// Declaration for the partial specialization with GPU
+template <class Distribution>
+struct FillPhiloxRandom<GPUDevice, Distribution> {
+ typedef typename Distribution::ResultElementType T;
+ void operator()(OpKernelContext* ctx, const GPUDevice&,
+ random::PhiloxRandom gen, T* data, int64 size);
+};
+
+#endif
+
+// A class to fill a specified range of random groups
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomTask;
+
+// Specialization for distributions that take a fixed number of samples for
+// each output.
+template <class Distribution>
+struct FillPhiloxRandomTask<Distribution, false> {
+ typedef typename Distribution::ResultElementType T;
+ static void Run(random::PhiloxRandom gen, T* data, int64 size,
+ int64 start_group, int64 limit_group) {
+ Distribution dist;
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ gen.Skip(start_group);
+ int64 offset = start_group * kGroupSize;
+
+ // First fill all the full-size groups
+ int64 limit_group_full = std::min(limit_group, size / kGroupSize);
+ for (int64 index = start_group; index < limit_group_full; ++index) {
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ // If there are any remaining elements that need to be filled, process them
+ if (limit_group_full < limit_group) {
+ int remaining_size = size - limit_group_full * kGroupSize;
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
+
+// Specialization for distributions that take a variable number of samples for
+// each output. This will be slower due to the generality.
+template <class Distribution>
+struct FillPhiloxRandomTask<Distribution, true> {
+ typedef typename Distribution::ResultElementType T;
+ static const int64 kReservedSamplesPerOutput = 256;
+
+ static void Run(random::PhiloxRandom base_gen, T* data, int64 size,
+ int64 start_group, int64 limit_group) {
+ using random::PhiloxRandom;
+ using random::SingleSampleAdapter;
+
+ Distribution dist;
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ static const int kGeneratorSkipPerOutputGroup =
+ kGroupSize * kReservedSamplesPerOutput /
+ PhiloxRandom::kResultElementCount;
+
+ int64 offset = start_group * kGroupSize;
+
+ // First fill all the full-size groups
+ int64 limit_group_full = std::min(limit_group, size / kGroupSize);
+ int64 group_index;
+ for (group_index = start_group; group_index < limit_group_full;
+ ++group_index) {
+ // Reset the generator to the beginning of the output group region
+ // This is necessary if we want the results to be independent of order
+ // of work
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ // If there are any remaining elements that need to be filled, process them
+ if (limit_group_full < limit_group) {
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ int remaining_size = size - limit_group_full * kGroupSize;
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
+
+// Partial specialization for CPU to fill the entire region with random values.
+// It splits the work into several tasks and runs them in parallel.
+template <class Distribution>
+struct FillPhiloxRandom<CPUDevice, Distribution> {
+ typedef typename Distribution::ResultElementType T;
+ void operator()(OpKernelContext* context, const CPUDevice&,
+ random::PhiloxRandom gen, T* data, int64 size) {
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+
+ int64 total_group_count = (size + kGroupSize - 1) / kGroupSize;
+
+ // Limit to a maximum of six threads for now. The performance scaling is
+ // very sub-linear, and too many threads cause much worse overall
+ // performance.
+ int num_workers = 6;
+ Shard(num_workers, worker_threads.workers, total_group_count, kGroupSize,
+ [&gen, data, size](int64 start_group, int64 limit_group) {
+ FillPhiloxRandomTask<
+ Distribution,
+ Distribution::kVariableSamplesPerOutput>::Run(gen, data, size,
+ start_group,
+ limit_group);
+ });
+ }
+};
+} // namespace functor
+
+// For now, use the same interface as RandomOp, so we can choose either one
+// at run time.
+template <typename Device, class Distribution>
+class PhiloxRandomOp : public OpKernel {
+ public:
+ typedef typename Distribution::ResultElementType T;
+ explicit PhiloxRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, generator_.Init(ctx));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& input = ctx->input(0);
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsLegacyVector(input.shape()),
+ errors::InvalidArgument("shape must be a vector of {int32,int64}."));
+ Tensor* output = nullptr;
+ if (input.dtype() == DataType::DT_INT32) {
+ auto vec = input.flat<int32>();
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
+ vec.data(), vec.size()),
+ &output));
+ } else if (input.dtype() == DataType::DT_INT64) {
+ auto vec = input.flat<int64>();
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape(
+ vec.data(), vec.size()),
+ &output));
+ } else {
+ OP_REQUIRES(ctx, false, errors::InvalidArgument(
+ "shape must be a vector of {int32,int64}."));
+ }
+ functor::FillPhiloxRandom<Device, Distribution>()(
+ ctx, ctx->eigen_device<Device>(),
+ ReserveRandomOutputs(output->flat<T>().size()),
+ output->flat<T>().data(), output->flat<T>().size());
+ }
+
+ private:
+ GuardedPhiloxRandom generator_;
+
+ // Reserve enough random samples in the generator for the given output count.
+ random::PhiloxRandom ReserveRandomOutputs(int64 output_count) {
+ int64 conservative_sample_count = output_count << 8;
+ return generator_.ReserveSamples128(conservative_sample_count);
+ }
+};
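
ReserveRandomOutputs shifts by 8, i.e. it reserves 256 128-bit Philox samples per requested output, matching the kReservedSamplesPerOutput used by the variable-sample path above. A quick arithmetic check with a hypothetical output count:

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t output_count = 1 << 20;  // hypothetical: 1M random outputs
      const int64_t conservative_sample_count = output_count << 8;
      std::cout << output_count << " outputs reserve "
                << conservative_sample_count
                << " 128-bit Philox samples (256 per output)\n";
      return 0;
    }
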
+
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomUniform") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<CPUDevice, random::UniformDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomStandardNormal") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<CPUDevice, random::NormalDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("TruncatedNormal") \
+ .Device(DEVICE_CPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp< \
+ CPUDevice, \
+ random::TruncatedNormalDistribution< \
+ random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+
+REGISTER(float);
+REGISTER(double);
+
+#undef REGISTER
+
+#if GOOGLE_CUDA
+
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomUniform") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<int32>("T") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<GPUDevice, random::UniformDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomStandardNormal") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<int32>("T") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp<GPUDevice, random::NormalDistribution< \
+ random::PhiloxRandom, TYPE> >); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("TruncatedNormal") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<int32>("T") \
+ .TypeConstraint<TYPE>("dtype"), \
+ PhiloxRandomOp< \
+ GPUDevice, \
+ random::TruncatedNormalDistribution< \
+ random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >)
+
+REGISTER(float);
+REGISTER(double);
+
+#undef REGISTER
+
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h
new file mode 100644
index 0000000000..7c7eed4227
--- /dev/null
+++ b/tensorflow/core/kernels/random_op.h
@@ -0,0 +1,16 @@
+#ifndef TENSORFLOW_KERNELS_RANDOM_OP_H_
+#define TENSORFLOW_KERNELS_RANDOM_OP_H_
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+namespace functor {
+
+template <typename Device, class Distribution>
+struct FillPhiloxRandom;
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RANDOM_OP_H_
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
new file mode 100644
index 0000000000..15cf85f27e
--- /dev/null
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -0,0 +1,152 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/random_op.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomKernel;
+
+// A CUDA kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a fixed number of samples.
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, false> {
+ typedef typename Distribution::ResultElementType T;
+ PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size) {
+ Distribution dist;
+ const int kGroupSize = Distribution::kResultElementCount;
+
+ const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32 total_thread_count = gridDim.x * blockDim.x;
+ int32 offset = thread_id * kGroupSize;
+ gen.Skip(thread_id);
+
+ while (offset < size) {
+ typename Distribution::ResultType samples = dist(&gen);
+
+ for (int i = 0; i < kGroupSize; ++i) {
+ if (offset >= size) {
+ return;
+ }
+ data[offset] = samples[i];
+ ++offset;
+ }
+
+ offset += (total_thread_count - 1) * kGroupSize;
+ gen.Skip(total_thread_count - 1);
+ }
+ }
+};
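
In the fixed-sample kernel each thread starts at thread_id * kGroupSize and, after writing one group, advances by total_thread_count * kGroupSize, so threads interleave groups without overlap. A host-side sketch of the offsets a few threads would cover, under hypothetical launch parameters:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
      // Hypothetical launch: 8 threads total, groups of 4 outputs each.
      const int32_t total_thread_count = 8;
      const int32_t kGroupSize = 4;
      const int64_t size = 100;

      for (int32_t thread_id = 0; thread_id < 3; ++thread_id) {
        std::cout << "thread " << thread_id << " fills offsets:";
        // Same arithmetic as the kernel: start at thread_id * kGroupSize and
        // advance by total_thread_count * kGroupSize per iteration.
        for (int64_t offset = int64_t{thread_id} * kGroupSize; offset < size;
             offset += int64_t{total_thread_count} * kGroupSize) {
          std::cout << " [" << offset << ", "
                    << std::min<int64_t>(offset + kGroupSize, size) << ")";
        }
        std::cout << "\n";
      }
      return 0;
    }
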
+
+// A CUDA kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a variable number of samples.
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, true> {
+ typedef typename Distribution::ResultElementType T;
+ PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
+ int64 size) {
+ using random::PhiloxRandom;
+ using random::SingleSampleAdapter;
+
+ const int kReservedSamplesPerOutput = 256;
+ const int kGroupSize = Distribution::kResultElementCount;
+ const int kGeneratorSkipPerOutputGroup = kGroupSize *
+ kReservedSamplesPerOutput /
+ PhiloxRandom::kResultElementCount;
+
+ const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const int32 total_thread_count = gridDim.x * blockDim.x;
+ int64 group_index = thread_id;
+ int64 offset = group_index * kGroupSize;
+ Distribution dist;
+
+ while (offset < size) {
+ // Since each output takes a variable number of samples, we need to
+      // realign the generator to the beginning of the current output group.
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ typename Distribution::ResultType samples = dist(&single_samples);
+
+ for (int i = 0; i < kGroupSize; ++i) {
+ if (offset >= size) {
+ return;
+ }
+ data[offset] = samples[i];
+ ++offset;
+ }
+
+ offset += (total_thread_count - 1) * kGroupSize;
+ group_index += total_thread_count;
+ }
+ }
+};
+
+// A simple launch pad to call the correct function templates to fill the data
+template <class Distribution>
+__global__ void __launch_bounds__(1024)
+ FillPhiloxRandomKernelLaunch(random::PhiloxRandom base_gen,
+ typename Distribution::ResultElementType* data,
+ int64 size) {
+ FillPhiloxRandomKernel<Distribution,
+ Distribution::kVariableSamplesPerOutput>()
+ .Run(base_gen, data, size);
+}
+
+// Partial specialization for GPU
+template <class Distribution>
+struct FillPhiloxRandom<GPUDevice, Distribution> {
+ typedef typename Distribution::ResultElementType T;
+ typedef GPUDevice Device;
+ void operator()(OpKernelContext*, const Device& d, random::PhiloxRandom gen,
+ T* data, int64 size) {
+ const int32 block_size = d.maxCudaThreadsPerBlock();
+ const int32 num_blocks =
+ (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+ block_size;
+
+ FillPhiloxRandomKernelLaunch<
+ Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data,
+ size);
+ }
+};
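
The launch configuration targets full occupancy: block_size is the device's per-block thread limit and num_blocks is the total number of resident threads divided by that limit. A worked example with hypothetical device limits (not values queried from a real GPU):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Hypothetical device limits.
      const int32_t max_threads_per_block = 1024;
      const int32_t multiprocessor_count = 24;
      const int32_t max_threads_per_multiprocessor = 2048;

      const int32_t block_size = max_threads_per_block;
      const int32_t num_blocks =
          (multiprocessor_count * max_threads_per_multiprocessor) / block_size;

      std::cout << "launch " << num_blocks << " blocks of " << block_size
                << " threads (" << num_blocks * block_size
                << " threads total)\n";  // 48 blocks x 1024 = 49152 threads
      return 0;
    }
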
+
+// Explicit instantiation of the GPU distribution functors
+// clang-format off
+// NVCC cannot handle ">>" properly
+template struct FillPhiloxRandom<
+ GPUDevice, random::UniformDistribution<random::PhiloxRandom, float> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::UniformDistribution<random::PhiloxRandom, double> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::TruncatedNormalDistribution<
+ random::SingleSampleAdapter<random::PhiloxRandom>, float> >;
+template struct FillPhiloxRandom<
+ GPUDevice, random::TruncatedNormalDistribution<
+ random::SingleSampleAdapter<random::PhiloxRandom>, double> >;
+// clang-format on
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/random_op_test.cc b/tensorflow/core/kernels/random_op_test.cc
new file mode 100644
index 0000000000..751b61cfba
--- /dev/null
+++ b/tensorflow/core/kernels/random_op_test.cc
@@ -0,0 +1,99 @@
+#include <random>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+Tensor Int32(int32 v) {
+ Tensor t(DT_INT32, TensorShape({}));
+ t.scalar<int32>()() = v;
+ return t;
+}
+
+Graph* RandomUniform(int64 n) {
+ Graph* g = new Graph(OpRegistry::Global());
+ test::graph::RandomUniform(g, test::graph::Constant(g, Int32(n)), DT_FLOAT);
+ return g;
+}
+
+Graph* RandomNormal(int64 n) {
+ Graph* g = new Graph(OpRegistry::Global());
+ test::graph::RandomGaussian(g, test::graph::Constant(g, Int32(n)), DT_FLOAT);
+ return g;
+}
+
+Graph* RandomParameters(int64 n) {
+ Graph* g = new Graph(OpRegistry::Global());
+ test::graph::RandomParameters(g, test::graph::Constant(g, Int32(n)),
+ DT_FLOAT);
+ return g;
+}
+
+#define BM_RNG(DEVICE, RNG) \
+ static void BM_##DEVICE##_##RNG(int iters, int arg) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * arg); \
+ test::Benchmark(#DEVICE, RNG(arg)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_##RNG)->Range(1 << 20, 8 << 20);
+
+BM_RNG(cpu, RandomUniform);
+BM_RNG(cpu, RandomNormal);
+BM_RNG(cpu, RandomParameters);
+
+BM_RNG(gpu, RandomUniform);
+BM_RNG(gpu, RandomNormal);
+BM_RNG(gpu, RandomParameters);
+
+static void BM_PhiloxRandom(int iters) {
+ // Fill 2M random numbers
+ int count = 2 << 20;
+
+ testing::ItemsProcessed(static_cast<int64>(iters) * count);
+
+ random::PhiloxRandom gen(0x12345);
+
+ int val = 1;
+ for (int i = 0; i < iters; ++i) {
+ for (int j = 0; j < count; j += 4) {
+      // Each invocation of gen() returns a 128-bit sample (four 32-bit values).
+ auto samples = gen();
+
+ // use the result trivially so the compiler does not optimize it away
+ val ^= samples[0] ^ samples[1] ^ samples[2] ^ samples[3];
+ }
+ }
+
+  // An anchor point to make sure the compiler does not cut corners
+ CHECK(val) << val;
+}
+BENCHMARK(BM_PhiloxRandom);
+
+static void BM_StdMTRandom(int iters) {
+ // Fill 2M random numbers
+ int count = 2 << 20;
+
+ testing::ItemsProcessed(static_cast<int64>(iters) * count);
+
+ std::mt19937 gen(0x12345);
+
+ int val = 1;
+ for (int i = 0; i < iters; ++i) {
+ for (int j = 0; j < count; ++j) {
+      // Each invocation of gen() returns a single 32-bit sample.
+ uint32 sample = gen();
+
+ // use the result trivially so the compiler does not optimize it away
+ val ^= sample;
+ }
+ }
+
+  // An anchor point to make sure the compiler does not cut corners
+ CHECK(val) << val;
+}
+BENCHMARK(BM_StdMTRandom);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_shuffle_op.cc b/tensorflow/core/kernels/random_shuffle_op.cc
new file mode 100644
index 0000000000..b87f4e58a0
--- /dev/null
+++ b/tensorflow/core/kernels/random_shuffle_op.cc
@@ -0,0 +1,89 @@
+// See docs in ../ops/random_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+
+namespace tensorflow {
+
+// TODO(irving): If performance is critical, generate output directly instead
+// of an in-place shuffle using a pseudorandom permutation like
+//
+// https://github.com/otherlab/geode/blob/master/geode/random/permute.cpp
+//
+// This is probably also the right thing if we want a GPU version of shuffling.
+
+// We use our own version of std::random_shuffle to guarantee that exactly
+// size - 1 samples are used.
+template <class Iter, class Random>
+static inline void RandomShuffle(Iter first, Iter last, Random& uniform) {
+ if (first == last) return;
+ const auto stop = last - 1;
+ for (auto i = first; i != stop; ++i) {
+ using std::iter_swap;
+ iter_swap(i, i + uniform(last - i));
+ }
+}
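
RandomShuffle above is a Fisher-Yates shuffle that draws exactly size - 1 uniform samples, one per swap. A standalone usage sketch with a std::mt19937-backed uniform functor standing in for the Philox adapter used by the op:

    #include <cstdint>
    #include <iostream>
    #include <random>
    #include <vector>

    // Same algorithm as RandomShuffle above: one uniform draw per swap.
    template <class Iter, class Random>
    static inline void RandomShuffleSketch(Iter first, Iter last, Random& uniform) {
      if (first == last) return;
      const auto stop = last - 1;
      for (auto i = first; i != stop; ++i) {
        using std::iter_swap;
        iter_swap(i, i + uniform(last - i));
      }
    }

    int main() {
      std::vector<int> v = {0, 1, 2, 3, 4, 5};
      std::mt19937 gen(42);
      int64_t draws = 0;
      // uniform(n) returns a value in [0, n), counting how many draws are made.
      auto uniform = [&gen, &draws](std::ptrdiff_t n) {
        ++draws;
        return std::uniform_int_distribution<std::ptrdiff_t>(0, n - 1)(gen);
      };
      RandomShuffleSketch(v.begin(), v.end(), uniform);
      for (int x : v) std::cout << x << ' ';
      std::cout << "\n(" << draws << " draws for " << v.size() << " elements)\n";
      return 0;
    }
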
+
+template <typename T>
+class RandomShuffleOp : public OpKernel {
+ public:
+ explicit RandomShuffleOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, generator_.Init(context));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+
+ if (input.NumElements() <= 1 || input.dim_size(0) <= 1) {
+ // No shuffling is required, so copy input directly to output
+ context->set_output(0, input);
+ } else {
+ // Reserve enough random samples for shuffling
+ const int64 size = input.dim_size(0);
+ const int64 samples = size - 1;
+ auto local_gen = generator_.ReserveSamples32(samples);
+ random::SingleSampleAdapter<random::PhiloxRandom> single(&local_gen);
+ const auto uniform = [&single](uint32 n) { return single() % n; };
+
+ if (input.dims() == 1) {
+ // For 1D data, copy and then shuffle in place
+ context->set_output(0, tensor::DeepCopy(input));
+ auto vec = context->mutable_output(0)->vec<T>();
+ RandomShuffle(vec.data(), vec.data() + size, uniform);
+ } else {
+ // For >= 2D, shuffle indices and then copy across
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ const auto input_mat = input.flat_outer_dims<T>();
+ auto output_mat = output->flat_outer_dims<T>();
+ std::vector<int> permutation(size);
+ for (int i = 0; i < size; i++) {
+ permutation[i] = i;
+ }
+ RandomShuffle(permutation.begin(), permutation.end(), uniform);
+ for (int i = 0; i < size; i++) {
+ output_mat.template chip<0>(i) =
+ input_mat.template chip<0>(permutation[i]);
+ }
+ }
+ }
+ }
+
+ private:
+ GuardedPhiloxRandom generator_;
+};
+
+#define REGISTER(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RandomShuffle").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ RandomShuffleOp<T>);
+TF_CALL_ALL_TYPES(REGISTER)
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
new file mode 100644
index 0000000000..561ec76e53
--- /dev/null
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -0,0 +1,740 @@
+// See docs in ../ops/data_flow_ops.cc.
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/queue_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class RandomShuffleQueue : public QueueBase {
+ public:
+ RandomShuffleQueue(int32 capacity, int32 min_after_dequeue, int64 seed,
+ int64 seed2, const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes,
+ const string& name);
+ Status Initialize(); // Must be called before any other method.
+
+ // Implementations of QueueInterface methods --------------------------------
+
+ Status ValidateTuple(const Tuple& tuple) override;
+ Status ValidateManyTuple(const Tuple& tuple) override;
+ void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) override;
+ void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override;
+ void TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) override;
+ void Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
+ DoneCallback callback) override;
+ Status MatchesNodeDef(const NodeDef& node_def) override;
+
+ int32 size() override {
+ mutex_lock lock(mu_);
+ return queues_[0].size();
+ }
+
+ private:
+ enum Action { kEnqueue, kDequeue };
+
+ ~RandomShuffleQueue() override {}
+
+ TensorShape ManyOutShape(int i, int batch_size) {
+ TensorShape shape({batch_size});
+ shape.AppendShape(component_shapes_[i]);
+ return shape;
+ }
+
+ // Helper for dequeuing a single random element from queues_.
+ void DequeueLocked(OpKernelContext* ctx, Tuple* tuple)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ void Cancel(Action action, CancellationToken token);
+
+ // Helper for cancelling all pending Enqueue(Many) operations when
+ // Close is called with cancel_pending_enqueues.
+ void CloseAndCancel();
+
+ // Tries to enqueue/dequeue (or close) based on whatever is at the
+ // front of enqueue_attempts_/dequeue_attempts_. Appends to
+  // *clean_up the callback for any finished attempt (so it may be
+ // called once mu_ is released). Returns true if any progress was
+ // made.
+ struct CleanUp {
+ CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm)
+ : finished(f), to_deregister(ct), cm(cm) {}
+ DoneCallback finished;
+ CancellationToken to_deregister;
+ CancellationManager* cm;
+ };
+ bool TryAttemptLocked(Action action, std::vector<CleanUp>* clean_up)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ // Tries to make progress on the enqueues or dequeues at the front
+ // of the *_attempts_ queues.
+ void FlushUnlocked();
+
+ const int32 capacity_;
+ const int32 min_after_dequeue_;
+ const int64 original_seed_;
+ const int64 original_seed2_;
+
+ mutex mu_;
+ typedef std::vector<PersistentTensor> SubQueue;
+ std::vector<SubQueue> queues_ GUARDED_BY(mu_);
+ bool closed_ GUARDED_BY(mu_);
+ random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+ random::SingleSampleAdapter<random::PhiloxRandom> generator_ GUARDED_BY(mu_);
+
+ enum RunResult { kNoProgress, kProgress, kComplete };
+ struct Attempt;
+ typedef std::function<RunResult(Attempt*)> RunCallback;
+ struct Attempt {
+ int32 elements_requested;
+ DoneCallback done_callback; // must be run outside mu_
+ OpKernelContext* context;
+ CancellationToken cancellation_token;
+ RunCallback run_callback; // must be run while holding mu_
+ bool is_cancelled;
+ Tuple tuple;
+
+ Attempt(int32 elements_requested, DoneCallback done_callback,
+ OpKernelContext* context, CancellationToken cancellation_token,
+ RunCallback run_callback)
+ : elements_requested(elements_requested),
+ done_callback(done_callback),
+ context(context),
+ cancellation_token(cancellation_token),
+ run_callback(run_callback),
+ is_cancelled(false) {}
+ };
+ std::deque<Attempt> enqueue_attempts_ GUARDED_BY(mu_);
+ std::deque<Attempt> dequeue_attempts_ GUARDED_BY(mu_);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueue);
+};
+
+RandomShuffleQueue::RandomShuffleQueue(
+ int capacity, int min_after_dequeue, int64 seed, int64 seed2,
+ const DataTypeVector& component_dtypes,
+ const std::vector<TensorShape>& component_shapes, const string& name)
+ : QueueBase(component_dtypes, component_shapes, name),
+ capacity_(capacity),
+ min_after_dequeue_(min_after_dequeue),
+ original_seed_(seed),
+ original_seed2_(seed2),
+ closed_(false),
+ generator_(&parent_generator_) {
+ if (seed == 0 && seed2 == 0) {
+ // If both seeds are unspecified, use completely random seeds.
+ seed = random::New64();
+ seed2 = random::New64();
+ }
+ parent_generator_ = random::PhiloxRandom(seed, seed2);
+}
+
+Status RandomShuffleQueue::Initialize() {
+ if (component_dtypes_.empty()) {
+ return errors::InvalidArgument("Empty component types for queue ", name_);
+ }
+ if (!component_shapes_.empty() &&
+ component_dtypes_.size() != component_shapes_.size()) {
+ return errors::InvalidArgument("Different number of component types (",
+ component_dtypes_.size(), ") vs. shapes (",
+ component_shapes_.size(), ").");
+ }
+
+ mutex_lock lock(mu_);
+ queues_.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ queues_.push_back(SubQueue());
+ queues_.back().reserve(min_after_dequeue_);
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status RandomShuffleQueue::ValidateTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ if (!tuple[i].shape().IsSameSize(component_shapes_[i])) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ component_shapes_[i].ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+// TODO(mrry): If these checks become a bottleneck, find a way to
+// reduce the number of times that they are called.
+Status RandomShuffleQueue::ValidateManyTuple(const Tuple& tuple) {
+ TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple));
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (specified_shapes()) {
+ for (size_t i = 0; i < tuple.size(); ++i) {
+ // Expected shape is [batch_size] + component_shapes_[i]
+ const TensorShape expected_shape = ManyOutShape(i, batch_size);
+ if (!tuple[i].shape().IsSameSize(expected_shape)) {
+ return errors::InvalidArgument(
+ "Shape mismatch in tuple component ", i, ". Expected ",
+ expected_shape.ShortDebugString(), ", got ",
+ tuple[i].shape().ShortDebugString());
+ }
+ }
+ } else {
+ for (size_t i = 1; i < tuple.size(); ++i) {
+ if (tuple[i].dim_size(0) != batch_size) {
+ return errors::InvalidArgument(
+ "All input tensors must have the same size in the 0th ",
+ "dimension. Component ", i, " has ", tuple[i].dim_size(0),
+ ", and should have ", batch_size);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+void RandomShuffleQueue::DequeueLocked(OpKernelContext* ctx, Tuple* tuple) {
+ DCHECK_GT(queues_[0].size(), 0);
+ int64 index = generator_() % queues_[0].size();
+ (*tuple).reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ (*tuple).push_back(*queues_[i][index].AccessTensor(ctx));
+ queues_[i][index] = queues_[i].back();
+ queues_[i].pop_back();
+ }
+}
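
DequeueLocked picks a uniformly random index and removes it in O(1) by overwriting it with the back element and popping the back; element order is not preserved, which is exactly what a shuffling queue wants. A standalone sketch of that removal trick (plain std types, not the PersistentTensor machinery used above):

    #include <iostream>
    #include <random>
    #include <vector>

    // Remove a uniformly random element in O(1): overwrite it with the last
    // element, then shrink the vector by one.
    int RemoveRandom(std::vector<int>* q, std::mt19937* gen) {
      std::uniform_int_distribution<size_t> pick(0, q->size() - 1);
      const size_t index = pick(*gen);
      const int value = (*q)[index];
      (*q)[index] = q->back();
      q->pop_back();
      return value;
    }

    int main() {
      std::vector<int> queue = {10, 20, 30, 40, 50};
      std::mt19937 gen(7);
      while (!queue.empty()) {
        std::cout << RemoveRandom(&queue, &gen) << ' ';
      }
      std::cout << '\n';
      return 0;
    }
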
+
+void RandomShuffleQueue::Cancel(Action action, CancellationToken token) {
+ DoneCallback callback = nullptr;
+ {
+ mutex_lock lock(mu_);
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ for (Attempt& attempt : *attempts) {
+ if (attempt.cancellation_token == token) {
+ attempt.is_cancelled = true;
+ if (action == kEnqueue) {
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ } else {
+ attempt.context->SetStatus(
+ errors::Cancelled("Dequeue operation was cancelled"));
+ }
+ std::swap(callback, attempt.done_callback);
+ break;
+ }
+ }
+ }
+ if (callback) {
+ callback();
+ FlushUnlocked();
+ }
+}
+
+void RandomShuffleQueue::CloseAndCancel() {
+ std::vector<DoneCallback> callbacks;
+ {
+ mutex_lock lock(mu_);
+ closed_ = true;
+ for (Attempt& attempt : enqueue_attempts_) {
+ attempt.is_cancelled = true;
+ attempt.context->SetStatus(
+ errors::Cancelled("Enqueue operation was cancelled"));
+ callbacks.emplace_back(std::move(attempt.done_callback));
+ }
+ }
+ for (const DoneCallback& callback : callbacks) {
+ callback();
+ }
+ FlushUnlocked();
+}
+
+bool RandomShuffleQueue::TryAttemptLocked(
+ Action action, std::vector<CleanUp>* clean_up) {
+ std::deque<Attempt>* attempts =
+ action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_;
+
+ bool progress = false;
+ bool done = false;
+ while (!done && !attempts->empty()) {
+ if (attempts->front().is_cancelled) {
+ if (action == kEnqueue) {
+ LOG(INFO) << "Skipping cancelled enqueue attempt";
+ } else {
+ LOG(INFO) << "Skipping cancelled dequeue attempt";
+ }
+ attempts->pop_front();
+ } else {
+ Attempt* cur_attempt = &attempts->front();
+ switch (cur_attempt->run_callback(cur_attempt)) {
+ case kNoProgress:
+ done = true;
+ break;
+ case kProgress:
+ done = true;
+ progress = true;
+ break;
+ case kComplete:
+ progress = true;
+ clean_up->emplace_back(std::move(cur_attempt->done_callback),
+ cur_attempt->cancellation_token,
+ cur_attempt->context->cancellation_manager());
+ attempts->pop_front();
+ break;
+ }
+ }
+ }
+ return progress;
+}
+
+void RandomShuffleQueue::FlushUnlocked() {
+ std::vector<CleanUp> clean_up;
+ Ref();
+ {
+ mutex_lock lock(mu_);
+ bool changed;
+ do {
+ changed = TryAttemptLocked(kEnqueue, &clean_up);
+ changed = TryAttemptLocked(kDequeue, &clean_up) || changed;
+ } while (changed);
+ }
+ Unref();
+ for (const auto& to_clean : clean_up) {
+ if (to_clean.to_deregister != CancellationManager::kInvalidToken) {
+ // NOTE(mrry): We can safely ignore the return value of
+ // DeregisterCallback because the mutex mu_ ensures that the
+ // cleanup action only executes once.
+ to_clean.cm->DeregisterCallback(to_clean.to_deregister);
+ }
+ to_clean.finished();
+ }
+}
+
+void RandomShuffleQueue::TryEnqueue(const Tuple& tuple, OpKernelContext* ctx,
+ DoneCallback callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ 1, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "RandomShuffleQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ if (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ for (int i = 0; i < num_components(); ++i) {
+ queues_[i].push_back(PersistentTensor(tuple[i]));
+ }
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+void RandomShuffleQueue::TryEnqueueMany(const Tuple& tuple,
+ OpKernelContext* ctx,
+ DoneCallback callback) {
+ const int64 batch_size = tuple[0].dim_size(0);
+ if (batch_size == 0) {
+ callback();
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kEnqueue, token); });
+ if (!already_cancelled) {
+ enqueue_attempts_.emplace_back(
+ batch_size, callback, ctx, token,
+ [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "RandomShuffleQueue '", name_, "' is closed."));
+ return kComplete;
+ }
+ RunResult result = kNoProgress;
+ while (queues_[0].size() < static_cast<size_t>(capacity_)) {
+ result = kProgress;
+ const int index =
+ tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ TensorShape element_shape(tuple[i].shape());
+ element_shape.RemoveDim(0);
+ PersistentTensor element;
+ Tensor* element_access = nullptr;
+ attempt->context->allocate_persistent(
+ tuple[i].dtype(), element_shape, &element, &element_access);
+ attempt->context->SetStatus(
+ CopySliceToElement(tuple[i], element_access, index));
+ if (!attempt->context->status().ok()) return kComplete;
+ queues_[i].push_back(element);
+ }
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled"));
+ callback();
+ }
+}
+
+void RandomShuffleQueue::TryDequeue(OpKernelContext* ctx,
+ CallbackWithTuple callback) {
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ 1, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ int32 s = queues_[0].size();
+ if (closed_ && s == 0) {
+ attempt->context->SetStatus(errors::OutOfRange(
+ "RandomShuffleQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ", 1, ", current size ", s,
+ ")"));
+ return kComplete;
+ }
+ if (!closed_) s -= min_after_dequeue_;
+ if (s > 0) {
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ attempt->done_callback = [callback, tuple]() { callback(tuple); };
+ return kComplete;
+ } else {
+ return kNoProgress;
+ }
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
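
While the queue is open, min_after_dequeue is subtracted before checking whether an element can be handed out, so a dequeue only succeeds if at least min_after_dequeue elements would remain; once the queue is closed, everything left can be drained. A small sketch of that gating rule with hypothetical numbers:

    #include <cstdint>
    #include <iostream>

    // Mirrors the gating logic in TryDequeue: subtract min_after_dequeue while
    // the queue is still open, then require a positive remainder.
    bool CanDequeueOne(int32_t size, int32_t min_after_dequeue, bool closed) {
      int32_t s = size;
      if (!closed) s -= min_after_dequeue;
      return s > 0;
    }

    int main() {
      const int32_t min_after_dequeue = 10;  // hypothetical queue attribute
      std::cout << CanDequeueOne(10, min_after_dequeue, /*closed=*/false)   // 0
                << CanDequeueOne(11, min_after_dequeue, /*closed=*/false)   // 1
                << CanDequeueOne(3, min_after_dequeue, /*closed=*/true)     // 1
                << "\n";
      return 0;
    }
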
+
+void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
+ CallbackWithTuple callback) {
+ if (!specified_shapes()) {
+ ctx->SetStatus(
+ errors::InvalidArgument("RandomShuffleQueue's DequeueMany requires the "
+ "components to have specified shapes."));
+ callback(Tuple());
+ return;
+ }
+ if (num_elements == 0) {
+ Tuple tuple;
+ tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ // TODO(josh11b,misard): Switch to allocate_output(). Problem is
+ // this breaks the abstraction boundary since we don't *really*
+ // know if and how the Tensors in the tuple we pass to callback
+ // correspond to the outputs of *ctx. For example, the
+ // ReaderRead Op uses TryDequeue() to get a filename out of a
+ // queue that is used internally by the reader and is not
+ // associated with any output of the ReaderRead.
+ // mrry@ adds:
+ // Maybe we need to pass a std::function<Tensor*(...)> (or
+ // better signature) that calls the appropriate allocator
+ // function in addition to ctx? (Or support a shim Allocator
+ // that has an internal OpKernelContext*, and dispatches to the
+ // appropriate method?)
+ // misard@ adds:
+ // I don't see that a std::function would help. The problem is
+ // that at this point (allocation time) the system doesn't know
+ // what is going to happen to the element read out of the
+ // queue. As long as we keep the generality that TensorFlow Ops
+ // do their own dynamic allocation in arbitrary C++ code, we
+ // need to preserve robustness to allocating output Tensors with
+ // the 'wrong' attributes, and fixing up with a copy. The only
+ // improvement I can see here in the future would be to support
+ // an optimized case where the queue 'knows' what attributes to
+ // use, and plumbs them through here.
+ Tensor element;
+ ctx->allocate_temp(component_dtypes_[i], ManyOutShape(i, 0), &element);
+ tuple.emplace_back(element);
+ }
+ callback(tuple);
+ return;
+ }
+
+ CancellationManager* cm = ctx->cancellation_manager();
+ CancellationToken token = cm->get_cancellation_token();
+ bool already_cancelled;
+ {
+ mutex_lock l(mu_);
+ already_cancelled = !cm->RegisterCallback(
+ token, [this, token]() { Cancel(kDequeue, token); });
+ if (!already_cancelled) {
+ // TODO(josh11b): This makes two copies of callback, avoid this if possible.
+ dequeue_attempts_.emplace_back(
+ num_elements, [callback]() { callback(Tuple()); }, ctx, token,
+ [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ int32 s = queues_[0].size();
+ if (closed_ && s < attempt->elements_requested) {
+ attempt->context->SetStatus(errors::OutOfRange(
+                  "RandomShuffleQueue '", name_, "' is closed and has ",
+ "insufficient elements (requested ",
+ attempt->elements_requested, ", current size ", s, ")"));
+ return kComplete;
+ }
+
+ RunResult result = kNoProgress;
+ if (!closed_) s -= min_after_dequeue_;
+ for (; s > 0; --s) {
+ if (attempt->tuple.empty()) {
+ // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+ // blocked dequeue attempts waiting.
+ attempt->tuple.reserve(num_components());
+ for (int i = 0; i < num_components(); ++i) {
+ const TensorShape shape =
+ ManyOutShape(i, attempt->elements_requested);
+ Tensor element;
+ attempt->context->allocate_temp(component_dtypes_[i], shape,
+ &element);
+ attempt->tuple.emplace_back(element);
+ }
+ }
+ result = kProgress;
+ Tuple tuple;
+ DequeueLocked(attempt->context, &tuple);
+ const int index =
+ attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+ for (int i = 0; i < num_components(); ++i) {
+ attempt->context->SetStatus(
+ CopyElementToSlice(tuple[i], &attempt->tuple[i], index));
+ if (!attempt->context->status().ok()) return kComplete;
+ }
+ tuple.clear();
+ --attempt->elements_requested;
+ if (attempt->elements_requested == 0) {
+ tuple = attempt->tuple;
+ attempt->done_callback = [callback, tuple]() {
+ callback(tuple);
+ };
+ return kComplete;
+ }
+ }
+ return result;
+ });
+ }
+ }
+ if (!already_cancelled) {
+ FlushUnlocked();
+ } else {
+ ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled"));
+ callback(Tuple());
+ }
+}
+
+void RandomShuffleQueue::Close(OpKernelContext* ctx,
+ bool cancel_pending_enqueues,
+ DoneCallback callback) {
+ if (cancel_pending_enqueues) {
+ CloseAndCancel();
+ callback();
+ } else {
+ {
+ mutex_lock lock(mu_);
+ enqueue_attempts_.emplace_back(
+ 0, callback, ctx, CancellationManager::kInvalidToken,
+ [this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ if (closed_) {
+ attempt->context->SetStatus(errors::Aborted(
+ "RandomShuffleQueue '", name_, "' is already closed."));
+ } else {
+ closed_ = true;
+ }
+ return kComplete;
+ });
+ }
+ FlushUnlocked();
+ }
+}
+
+Status RandomShuffleQueue::MatchesNodeDef(const NodeDef& node_def) {
+ TF_RETURN_IF_ERROR(MatchesNodeDefOp(node_def, "RandomShuffleQueue"));
+ TF_RETURN_IF_ERROR(MatchesNodeDefCapacity(node_def, capacity_));
+
+ int32 min_after_dequeue = -1;
+ TF_RETURN_IF_ERROR(
+ GetNodeAttr(node_def, "min_after_dequeue", &min_after_dequeue));
+ if (min_after_dequeue != min_after_dequeue_) {
+ return errors::InvalidArgument(
+ "Shared queue '", name_, "' has min_after_dequeue ",
+ min_after_dequeue_, " but requested min_after_dequeue was ",
+ min_after_dequeue, ".");
+ }
+
+ int64 seed = -1;
+ int64 seed2 = -1;
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "seed", &seed));
+ TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "seed2", &seed2));
+ if ((seed != 0 || seed2 != 0) &&
+ (seed != original_seed_ || seed2 != original_seed2_)) {
+ return errors::InvalidArgument(
+ "Shared queue '", name_, "' has random seeds (", original_seed_, ", ",
+ original_seed2_, ") but requested seeds are (", seed, ", ", seed2,
+ ").");
+ }
+
+ TF_RETURN_IF_ERROR(MatchesNodeDefTypes(node_def));
+ TF_RETURN_IF_ERROR(MatchesNodeDefShapes(node_def));
+
+ return Status::OK();
+}
+
+typedef std::shared_ptr<QueueInterface> QueueInterfacePtr;
+
+// Defines a RandomShuffleQueueOp, which produces a Queue (specifically, one
+// backed by RandomShuffleQueue) that persists across different graph
+// executions and sessions. Running this op produces a single-element
+// tensor of handles to Queues in the corresponding device.
+class RandomShuffleQueueOp : public OpKernel {
+ public:
+ explicit RandomShuffleQueueOp(OpKernelConstruction* context)
+ : OpKernel(context), queue_handle_set_(false) {
+ OP_REQUIRES_OK(context, context->GetAttr("capacity", &capacity_));
+ OP_REQUIRES_OK(context,
+ context->allocate_persistent(DT_STRING, TensorShape({2}),
+ &queue_handle_, nullptr));
+ if (capacity_ < 0) {
+ capacity_ = RandomShuffleQueue::kUnbounded;
+ }
+ OP_REQUIRES_OK(context,
+ context->GetAttr("min_after_dequeue", &min_after_dequeue_));
+ OP_REQUIRES(context, min_after_dequeue_ >= 0,
+ errors::InvalidArgument("min_after_dequeue ",
+ min_after_dequeue_, " must be >= 0"));
+ OP_REQUIRES(
+ context, min_after_dequeue_ < capacity_,
+ errors::InvalidArgument("min_after_dequeue ", min_after_dequeue_,
+ " must be < capacity ", capacity_));
+ OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+ OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
+
+ OP_REQUIRES_OK(context,
+ context->GetAttr("component_types", &component_types_));
+ OP_REQUIRES_OK(context, context->GetAttr("shapes", &component_shapes_));
+ }
+
+ ~RandomShuffleQueueOp() override {
+ // If the queue object was not shared, delete it.
+ if (queue_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+ TF_CHECK_OK(cinfo_.resource_manager()->Delete<QueueInterface>(
+ cinfo_.container(), cinfo_.name()));
+ }
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(mu_);
+ if (!queue_handle_set_) {
+ OP_REQUIRES_OK(ctx, SetQueueHandle(ctx));
+ }
+ ctx->set_output_ref(0, &mu_, queue_handle_.AccessTensor(ctx));
+ }
+
+ private:
+ Status SetQueueHandle(OpKernelContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ TF_RETURN_IF_ERROR(cinfo_.Init(ctx->resource_manager(), def()));
+ QueueInterface* queue;
+ auto creator = [this](QueueInterface** ret) {
+ auto* q = new RandomShuffleQueue(capacity_, min_after_dequeue_, seed_,
+ seed2_, component_types_,
+ component_shapes_, cinfo_.name());
+ Status s = q->Initialize();
+ if (s.ok()) {
+ *ret = q;
+ } else {
+ q->Unref();
+ }
+ return s;
+ };
+ TF_RETURN_IF_ERROR(
+ cinfo_.resource_manager()->LookupOrCreate<QueueInterface>(
+ cinfo_.container(), cinfo_.name(), &queue, creator));
+ core::ScopedUnref unref_me(queue);
+ // Verify that the shared queue is compatible with the requested arguments.
+ TF_RETURN_IF_ERROR(queue->MatchesNodeDef(def()));
+ auto h = queue_handle_.AccessTensor(ctx)->flat<string>();
+ h(0) = cinfo_.container();
+ h(1) = cinfo_.name();
+ queue_handle_set_ = true;
+ return Status::OK();
+ }
+
+ int32 capacity_;
+ int32 min_after_dequeue_;
+ int64 seed_;
+ int64 seed2_;
+ DataTypeVector component_types_;
+ std::vector<TensorShape> component_shapes_;
+ ContainerInfo cinfo_;
+
+ mutex mu_;
+ PersistentTensor queue_handle_ GUARDED_BY(mu_);
+ bool queue_handle_set_ GUARDED_BY(mu_);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueueOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("RandomShuffleQueue").Device(DEVICE_CPU),
+ RandomShuffleQueueOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
new file mode 100644
index 0000000000..a3f4e0b0cb
--- /dev/null
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -0,0 +1,305 @@
+#include "tensorflow/core/kernels/range_sampler.h"
+
+#include <vector>
+#include <unordered_set>
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+using gtl::ArraySlice;
+using gtl::MutableArraySlice;
+
+RangeSampler::~RangeSampler() {}
+
+void RangeSampler::SampleBatch(random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch) const {
+ SampleBatchGetExpectedCount(
+ rnd, unique, batch, gtl::MutableArraySlice<float>(),
+ gtl::ArraySlice<int64>(), gtl::MutableArraySlice<float>());
+}
+
+void RangeSampler::SampleBatchGetExpectedCount(
+ random::SimplePhilox* rnd, bool unique, gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count) const {
+ SampleBatchGetExpectedCountAvoid(rnd, unique, batch, batch_expected_count,
+ extras, extras_expected_count,
+ gtl::ArraySlice<int64>());
+}
+
+namespace {
+
+// Approximates the expected count of a value in the output of SampleBatch.
+//
+// If unique=false, then this is (Probability(value) * batch_size)
+//
+// We use batch_size and num_tries, where num_tries is the observed number of
+// tries it took to get batch_size unique values.
+//
+// Assuming (falsely) that the number of tries to get a batch of batch_size
+// distinct values is _always_ num_tries, the probability that the value
+// is in a batch is (1 - (1-p)^num_tries)
+static float ExpectedCountHelper(float p, int batch_size, int num_tries) {
+ if (num_tries == batch_size) {
+ // This shortcut will always be taken if unique=false
+ return p * batch_size;
+ }
+ // numerically stable version of (1 - (1-p)^num_tries)
+ return -expm1(num_tries * log1p(-p));
+}
+
+} // namespace
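
The -expm1(num_tries * log1p(-p)) expression is algebraically identical to 1 - (1 - p)^num_tries but avoids the catastrophic cancellation the naive form suffers for very small p. A quick numerical comparison of the two forms:

    #include <cmath>
    #include <cstdio>

    int main() {
      const double p = 1e-15;      // probability of a very rare value
      const int num_tries = 1000;  // observed tries for the batch

      // Naive form: the final subtraction cancels most significant digits.
      const double naive = 1.0 - std::pow(1.0 - p, num_tries);
      // Stable form, as used by ExpectedCountHelper above.
      const double stable = -std::expm1(num_tries * std::log1p(-p));

      // Both approximate num_tries * p = 1e-12, but the stable form keeps
      // full double precision while the naive one retains only a few digits.
      std::printf("naive  = %.17g\nstable = %.17g\n", naive, stable);
      return 0;
    }
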
+
+void RangeSampler::SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch,
+ MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras,
+ MutableArraySlice<float> extras_expected_count,
+ ArraySlice<int64> avoided_values) const {
+ const int batch_size = batch.size();
+ int num_tries;
+
+ if (unique) {
+ CHECK_LE(batch_size + avoided_values.size(), range_);
+ std::unordered_set<int64> used(batch_size);
+ used.insert(avoided_values.begin(), avoided_values.end());
+ int num_picked = 0;
+ num_tries = 0;
+ while (num_picked < batch_size) {
+ num_tries++;
+ CHECK_LT(num_tries, kint32max);
+ int64 value = Sample(rnd);
+ if (gtl::InsertIfNotPresent(&used, value)) {
+ batch[num_picked++] = value;
+ }
+ }
+ } else {
+ CHECK_EQ(avoided_values.size(), 0)
+ << "avoided_values only supported with unique=true";
+ for (int i = 0; i < batch_size; i++) {
+ batch[i] = Sample(rnd);
+ }
+ num_tries = batch_size;
+ }
+ // Compute the expected counts of the batch and the extra values
+ if (batch_expected_count.size() > 0) {
+ CHECK_EQ(batch_size, batch_expected_count.size());
+ for (int i = 0; i < batch_size; i++) {
+ batch_expected_count[i] =
+ ExpectedCountHelper(Probability(batch[i]), batch_size, num_tries);
+ }
+ }
+ CHECK_EQ(extras.size(), extras_expected_count.size());
+ for (size_t i = 0; i < extras.size(); i++) {
+ extras_expected_count[i] =
+ ExpectedCountHelper(Probability(extras[i]), batch_size, num_tries);
+ }
+}
+
+AllSampler::AllSampler(int64 range)
+ : RangeSampler(range), inv_range_(1.0 / range) {}
+
+void AllSampler::SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch,
+ MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras,
+ MutableArraySlice<float> extras_expected_count,
+ ArraySlice<int64> avoided_values) const {
+ const int batch_size = batch.size();
+ CHECK_EQ(range_, batch_size);
+ for (int i = 0; i < batch_size; i++) {
+ batch[i] = i;
+ }
+ if (batch_expected_count.size() > 0) {
+ CHECK_EQ(batch_size, batch_expected_count.size());
+ for (int i = 0; i < batch_size; i++) {
+ batch_expected_count[i] = 1;
+ }
+ }
+ CHECK_EQ(0, avoided_values.size());
+ CHECK_EQ(extras.size(), extras_expected_count.size());
+ for (size_t i = 0; i < extras.size(); i++) {
+ extras_expected_count[i] = 1;
+ }
+}
+
+UniformSampler::UniformSampler(int64 range)
+ : RangeSampler(range), inv_range_(1.0 / range) {}
+
+int64 UniformSampler::Sample(random::SimplePhilox* rnd) const {
+ return rnd->Uniform64(range_);
+}
+
+float UniformSampler::Probability(int64 value) const { return inv_range_; }
+
+LogUniformSampler::LogUniformSampler(int64 range)
+ : RangeSampler(range), log_range_(log(range + 1)) {}
+
+int64 LogUniformSampler::Sample(random::SimplePhilox* rnd) const {
+ const int64 value =
+ static_cast<int64>(exp(rnd->RandDouble() * log_range_)) - 1;
+ CHECK_GE(value, 0);
+ // Mathematically, value should be <= range_, but might not be due to some
+ // floating point roundoff, so we mod by range_.
+ return value % range_;
+}
+
+float LogUniformSampler::Probability(int64 value) const {
+  // value is returned iff rnd->RandDouble() * log_range_ in the
+  // Sample() function falls between log(value + 1)
+ // and log(value + 2). The probability of this is:
+ // (log(value + 2) - log(value + 1)) / log_range
+ // To avoid two calls to log(), we compute this as follows:
+ return (log((value + 2.0) / (value + 1.0))) / log_range_;
+}
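
These per-value probabilities telescope: summing log((v + 2) / (v + 1)) over v = 0 .. range - 1 gives log(range + 1), which is exactly log_range_, so the distribution sums to one. A quick numerical check of that identity for a hypothetical range:

    #include <cmath>
    #include <cstdio>

    int main() {
      const long long range = 1000;  // hypothetical vocabulary size
      const double log_range = std::log(static_cast<double>(range) + 1.0);

      double sum = 0.0;
      for (long long value = 0; value < range; ++value) {
        // Same expression as LogUniformSampler::Probability above.
        sum += std::log((value + 2.0) / (value + 1.0)) / log_range;
      }
      // The sum telescopes to log(range + 1) / log(range + 1) == 1.
      std::printf("sum of probabilities = %.12f\n", sum);
      return 0;
    }
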
+
+ThreadUnsafeUnigramSampler::ThreadUnsafeUnigramSampler(int64 range)
+ : RangeSampler(range), picker_(range) {
+ CHECK_LT(range, kint32max);
+}
+
+int64 ThreadUnsafeUnigramSampler::Sample(random::SimplePhilox* rnd) const {
+ return picker_.Pick(rnd);
+}
+
+float ThreadUnsafeUnigramSampler::Probability(int64 value) const {
+ return static_cast<float>(picker_.get_weight(value)) / picker_.total_weight();
+}
+
+void ThreadUnsafeUnigramSampler::Update(ArraySlice<int64> values) {
+ int num_updates = std::min(static_cast<int>(values.size()),
+ kint32max - picker_.total_weight());
+ for (int i = 0; i < num_updates; i++) {
+ const int64 value = values[i];
+ picker_.set_weight(value, picker_.get_weight(value) + 1);
+ }
+}
+
+// Thread-safe unigram sampler
+UnigramSampler::UnigramSampler(int64 range)
+ : RangeSampler(range), unsafe_sampler_(range) {
+ CHECK_LT(range, kint32max);
+}
+
+int64 UnigramSampler::Sample(random::SimplePhilox* rnd) const {
+ mutex_lock lock(mu_); // could use reader lock
+ return unsafe_sampler_.Sample(rnd);
+}
+
+float UnigramSampler::Probability(int64 value) const {
+ mutex_lock lock(mu_); // could use reader lock
+ return unsafe_sampler_.Probability(value);
+}
+
+// Overriding at a high level results in far fewer lock acquisitions.
+void UnigramSampler::SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch,
+ MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras,
+ MutableArraySlice<float> extras_expected_count,
+ ArraySlice<int64> avoided_values) const {
+ mutex_lock lock(mu_); // could use reader lock
+ unsafe_sampler_.SampleBatchGetExpectedCountAvoid(
+ rnd, unique, batch, batch_expected_count, extras, extras_expected_count,
+ avoided_values);
+}
+
+void UnigramSampler::Update(ArraySlice<int64> values) {
+ mutex_lock lock(mu_);
+ unsafe_sampler_.Update(values);
+}
+
+FixedUnigramSampler::FixedUnigramSampler(Env* env, int64 range,
+ const string& vocab_file,
+ float distortion,
+ int32 num_reserved_ids,
+ int32 num_shards, int32 shard)
+ : RangeSampler(range),
+ total_weight_(0.0),
+ num_shards_(num_shards),
+ shard_(shard) {
+ FillReservedIds(num_reserved_ids);
+ // TODO(vanhoucke): make this non-crashing.
+ TF_CHECK_OK(LoadFromFile(env, vocab_file, distortion));
+ CHECK_EQ(range, weights_.size());
+ dist_sampler_.reset(new random::DistributionSampler(weights_));
+}
+
+FixedUnigramSampler::FixedUnigramSampler(int64 range,
+ const std::vector<float>& unigrams,
+ float distortion,
+ int32 num_reserved_ids,
+ int32 num_shards, int32 shard)
+ : RangeSampler(range),
+ total_weight_(0.0),
+ num_shards_(num_shards),
+ shard_(shard) {
+ FillReservedIds(num_reserved_ids);
+ LoadFromUnigrams(unigrams, distortion);
+ // TODO(vanhoucke): make this non-crashing.
+ CHECK_EQ(range, weights_.size());
+ dist_sampler_.reset(new random::DistributionSampler(weights_));
+}
+
+float FixedUnigramSampler::Probability(int64 value) const {
+ return weights_.at(value) / total_weight_;
+}
+
+int64 FixedUnigramSampler::Sample(random::SimplePhilox* rnd) const {
+ return dist_sampler_->Sample(rnd);
+}
+
+void FixedUnigramSampler::FillReservedIds(int32 num_reserved_ids) {
+ for (int32 word_id = 0; word_id < num_reserved_ids; ++word_id) {
+ if (word_id % num_shards_ == shard_) weights_.push_back(0.0);
+ }
+}
+
+Status FixedUnigramSampler::LoadFromFile(Env* env, const string& vocab_file,
+ float distortion) {
+ RandomAccessFile* file;
+ TF_RETURN_IF_ERROR(env->NewRandomAccessFile(vocab_file, &file));
+ io::InputBuffer in(file, 262144 /*bytes*/);
+ string line;
+ int32 word_id = weights_.size();
+ while (in.ReadLine(&line).ok()) {
+    // The vocabulary file should be in CSV-like format, with the last
+    // field being the weight associated with the word.
+ std::vector<string> cols = str_util::Split(line, ',');
+ if (cols.size() == 0) continue;
+ // Skip entries that do not belong to this shard.
+ if (word_id % num_shards_ == shard_) {
+ float w = 0.0;
+ if (!strings::safe_strtof(cols.at(cols.size() - 1).c_str(), &w)) {
+ return errors::InvalidArgument("Wrong vocabulary format at line: ",
+ line);
+ }
+ w = pow(w, distortion);
+ total_weight_ += w;
+ weights_.push_back(w);
+ }
+ ++word_id;
+ }
+ return Status::OK();
+}
+
+void FixedUnigramSampler::LoadFromUnigrams(const std::vector<float>& unigrams,
+ float distortion) {
+ int32 word_id = weights_.size();
+ for (float w : unigrams) {
+ // Skip entries that do not belong to this shard.
+ if (word_id % num_shards_ == shard_) {
+ w = pow(w, distortion);
+ total_weight_ += w;
+ weights_.push_back(w);
+ }
+ ++word_id;
+ }
+}
+
+} // namespace tensorflow
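
The distortion exponent interpolates between the raw unigram weights (distortion = 1) and a uniform distribution (distortion = 0). A small sketch of how pow(w, distortion) reshapes a hypothetical weight vector before normalization:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      const std::vector<float> unigrams = {100.0f, 10.0f, 1.0f};  // hypothetical counts

      for (float distortion : {1.0f, 0.5f, 0.0f}) {
        double total = 0.0;
        std::vector<double> weights;
        for (float w : unigrams) {
          const double distorted = std::pow(w, distortion);
          weights.push_back(distorted);
          total += distorted;
        }
        std::printf("distortion %.1f:", distortion);
        for (double w : weights) std::printf(" %.3f", w / total);  // probabilities
        std::printf("\n");
      }
      return 0;
    }
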
diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h
new file mode 100644
index 0000000000..18364c2c03
--- /dev/null
+++ b/tensorflow/core/kernels/range_sampler.h
@@ -0,0 +1,237 @@
+#ifndef TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
+#define TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
+
+#include <vector>
+
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/distribution_sampler.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/lib/random/weighted_picker.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+class Env;
+
+// Abstract subclass for sampling from the set of non-negative integers
+// [0, range)
+class RangeSampler {
+ public:
+ explicit RangeSampler(int range) : range_(range) { CHECK_GT(range_, 0); }
+ virtual ~RangeSampler();
+
+ // Sample a single value
+ virtual int64 Sample(random::SimplePhilox* rnd) const = 0;
+
+ // The probability that a single call to Sample() returns the given value.
+ // Assumes that value is in [0, range). No range checking is done.
+ virtual float Probability(int64 value) const = 0;
+
+ // Fill "batch" with samples from the distribution.
+ // If unique=true, then we re-pick each element until we get a
+ // value distinct from all previously picked values in the batch.
+ void SampleBatch(random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch) const;
+
+ // Fill "batch" with samples from the distribution, and report
+ // "expected counts".
+ //
+ // The "expected count" of a value is an estimate of the expected
+ // number of occurrences of the value in the batch returned by a
+ // call to this function with the given parameters. If unique=true,
+ // the expected count is an inclusion probability. For details on
+ // this estimation, see the comment to "ExpectedCountHelper" in the
+ // .cc file.
+ //
+ // Expected counts for the elements of the returned "batch" are reported
+ // in the aligned array "batch_expected_count".
+ //
+  // The user can optionally provide "extras", containing values in the range.
+ // The expected counts for the extras are reported in the aligned array
+ // "extras_expected_count".
+ //
+ // "batch_expected_count" must have size equal to 0 or to the size of "batch".
+ // "extras" and "extras_expected_count" must have equal size.
+ void SampleBatchGetExpectedCount(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count) const;
+
+ // Same as SampleBatchGetExpectedCount (see above), but with avoided values.
+ // We repick to avoid all of the values in "avoided_values".
+ // "avoided_values" is only supported with unique=true. If
+ // unique=false, then avoided_values must be empty.
+ virtual void SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count,
+ gtl::ArraySlice<int64> avoided_values) const;
+
+  // Does this sampler need to be updated with values (e.g., UnigramSampler)?
+ virtual bool NeedsUpdates() const { return false; }
+
+ // Updates the underlying distribution
+ virtual void Update(gtl::ArraySlice<int64> values) {
+ LOG(FATAL) << "Update not supported for this sampler type.";
+ }
+
+ int64 range() { return range_; }
+
+ protected:
+ const int64 range_;
+};
+
+// An AllSampler only samples batches of size equal to range.
+// It returns the entire range.
+// It cannot sample single values.
+class AllSampler : public RangeSampler {
+ public:
+ explicit AllSampler(int64 range);
+
+ ~AllSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override {
+ LOG(FATAL) << "Should not be called";
+ }
+
+ float Probability(int64 value) const override {
+ LOG(FATAL) << "Should not be called";
+ }
+
+ void SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count,
+ gtl::ArraySlice<int64> avoided_values) const override;
+
+ private:
+ const float inv_range_;
+};
+
+class UniformSampler : public RangeSampler {
+ public:
+ explicit UniformSampler(int64 range);
+
+ ~UniformSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+ private:
+ const float inv_range_;
+};
+
+class LogUniformSampler : public RangeSampler {
+ public:
+ explicit LogUniformSampler(int64 range);
+
+ ~LogUniformSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+ private:
+ const double log_range_;
+};
+
+// Thread-unsafe unigram sampler
+class ThreadUnsafeUnigramSampler : public RangeSampler {
+ public:
+ explicit ThreadUnsafeUnigramSampler(int64 range);
+ ~ThreadUnsafeUnigramSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+ bool NeedsUpdates() const override { return true; }
+ void Update(gtl::ArraySlice<int64> values) override;
+
+ private:
+ random::WeightedPicker picker_;
+};
+
+// Thread-safe unigram sampler
+class UnigramSampler : public RangeSampler {
+ public:
+ explicit UnigramSampler(int64 range);
+ ~UnigramSampler() override {}
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ float Probability(int64 value) const override;
+
+  // Overriding at a high level results in far fewer lock acquisitions.
+ void SampleBatchGetExpectedCountAvoid(
+ random::SimplePhilox* rnd, bool unique,
+ gtl::MutableArraySlice<int64> batch,
+ gtl::MutableArraySlice<float> batch_expected_count,
+ gtl::ArraySlice<int64> extras,
+ gtl::MutableArraySlice<float> extras_expected_count,
+ gtl::ArraySlice<int64> avoided_values) const override;
+
+ bool NeedsUpdates() const override { return true; }
+ void Update(gtl::ArraySlice<int64> values) override;
+
+ private:
+ ThreadUnsafeUnigramSampler unsafe_sampler_ GUARDED_BY(mu_);
+ mutable mutex mu_;
+};
+
+// A unigram sampler that uses a fixed unigram distribution read from a
+// file or passed in as an in-memory array instead of building up the
+// distribution from data on the fly. There is also an option to skew the
+// distribution by applying a distortion power to the weights.
+class FixedUnigramSampler : public RangeSampler {
+ public:
+ // The vocab_file is assumed to be a CSV, with the last entry of each row a
+ // value representing the count or probability for the corresponding ID.
+ FixedUnigramSampler(Env* env, int64 range, const string& vocab_file,
+ float distortion, int32 num_reserved_ids,
+ int32 num_shards, int32 shard);
+
+ FixedUnigramSampler(int64 range, const std::vector<float>& unigrams,
+ float distortion, int32 num_reserved_ids,
+ int32 num_shards, int32 shard);
+
+ float Probability(int64 value) const override;
+
+ int64 Sample(random::SimplePhilox* rnd) const override;
+
+ private:
+ // Underlying distribution sampler.
+ std::unique_ptr<random::DistributionSampler> dist_sampler_;
+ // Weights for individual samples. The probability of a sample i is defined
+ // as weights_.at(i) / total_weight_.
+ std::vector<float> weights_;
+ // The total weight of all samples.
+ float total_weight_;
+ // Sharding information of the sampler. The whole vocabulary is sharded
+ // into num_shards_ smaller ranges and each sampler is responsible for one
+ // such smaller range, identified by the shard number.
+ int32 num_shards_;
+ int32 shard_;
+
+ // Fill the sampler with the appropriate number of reserved IDs.
+ void FillReservedIds(int32 num_reserved_ids);
+ // Load IDs to sample from a CSV file. It is assumed that the last item of
+ // each row contains a count or probability for the corresponding ID.
+ Status LoadFromFile(Env* env, const string& vocab_file, float distortion);
+ // Load from an in-memory array.
+ void LoadFromUnigrams(const std::vector<float>& unigrams, float distortion);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
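For context, a minimal usage sketch of the sampler interface declared above (not part of the change itself; it only relies on the SampleBatch and Probability methods that the tests below also exercise): draw five distinct values from [0, 100) under a log-uniform distribution and report the probability assigned to each.

    #include <vector>
    #include "tensorflow/core/kernels/range_sampler.h"
    #include "tensorflow/core/lib/random/simple_philox.h"
    #include "tensorflow/core/platform/logging.h"

    void SketchSampleBatch() {
      tensorflow::LogUniformSampler sampler(100);
      tensorflow::random::PhiloxRandom philox(301, 17);
      tensorflow::random::SimplePhilox rnd(&philox);
      std::vector<tensorflow::int64> batch(5);
      sampler.SampleBatch(&rnd, true /* unique */, &batch);
      for (tensorflow::int64 value : batch) {
        LOG(INFO) << value << " sampled with probability "
                  << sampler.Probability(value);
      }
    }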
diff --git a/tensorflow/core/kernels/range_sampler_test.cc b/tensorflow/core/kernels/range_sampler_test.cc
new file mode 100644
index 0000000000..72c39009e4
--- /dev/null
+++ b/tensorflow/core/kernels/range_sampler_test.cc
@@ -0,0 +1,320 @@
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/kernels/range_sampler.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+namespace {
+
+using gtl::ArraySlice;
+using gtl::MutableArraySlice;
+
+class RangeSamplerTest : public ::testing::Test {
+ protected:
+ void CheckProbabilitiesSumToOne() {
+ double sum = 0;
+ for (int i = 0; i < sampler_->range(); i++) {
+ sum += sampler_->Probability(i);
+ }
+ EXPECT_NEAR(sum, 1.0, 1e-4);
+ }
+ void CheckHistogram(int num_samples, float tolerance) {
+ const int range = sampler_->range();
+ std::vector<int> h(range);
+ std::vector<int64> a(num_samples);
+ // Using a fixed random seed to make the test deterministic.
+ random::PhiloxRandom philox(123, 17);
+ random::SimplePhilox rnd(&philox);
+ sampler_->SampleBatch(&rnd, false, &a);
+ for (int i = 0; i < num_samples; i++) {
+ int64 val = a[i];
+ ASSERT_GE(val, 0);
+ ASSERT_LT(val, range);
+ h[val]++;
+ }
+ for (int val = 0; val < range; val++) {
+ EXPECT_NEAR((h[val] + 0.0) / num_samples, sampler_->Probability(val),
+ tolerance);
+ }
+ }
+ void Update1() {
+ // Add the value 3 ten times.
+ std::vector<int64> a(10);
+ for (int i = 0; i < 10; i++) {
+ a[i] = 3;
+ }
+ sampler_->Update(a);
+ }
+ void Update2() {
+ // Add each value n exactly n times (for n in [1, 9]).
+ int64 a[10];
+ for (int i = 0; i < 10; i++) {
+ a[i] = i;
+ }
+ for (int64 i = 1; i < 10; i++) {
+ sampler_->Update(ArraySlice<int64>(a + i, 10 - i));
+ }
+ }
+ std::unique_ptr<RangeSampler> sampler_;
+};
+
+TEST_F(RangeSamplerTest, UniformProbabilities) {
+ sampler_.reset(new UniformSampler(10));
+ for (int i = 0; i < 10; i++) {
+ CHECK_EQ(sampler_->Probability(i), sampler_->Probability(0));
+ }
+}
+
+TEST_F(RangeSamplerTest, UniformChecksum) {
+ sampler_.reset(new UniformSampler(10));
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, UniformHistogram) {
+ sampler_.reset(new UniformSampler(10));
+ CheckHistogram(1000, 0.05);
+}
+
+TEST_F(RangeSamplerTest, LogUniformProbabilities) {
+ int range = 1000000;
+ sampler_.reset(new LogUniformSampler(range));
+ for (int i = 100; i < range; i *= 2) {
+ float ratio = sampler_->Probability(i) / sampler_->Probability(i / 2);
+ EXPECT_NEAR(ratio, 0.5, 0.1);
+ }
+}
+
+TEST_F(RangeSamplerTest, LogUniformChecksum) {
+ sampler_.reset(new LogUniformSampler(10));
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, LogUniformHistogram) {
+ sampler_.reset(new LogUniformSampler(10));
+ CheckHistogram(1000, 0.05);
+}
+
+TEST_F(RangeSamplerTest, UnigramProbabilities1) {
+ sampler_.reset(new UnigramSampler(10));
+ Update1();
+ EXPECT_NEAR(sampler_->Probability(3), 0.55, 1e-4);
+ for (int i = 0; i < 10; i++) {
+ if (i != 3) {
+ ASSERT_NEAR(sampler_->Probability(i), 0.05, 1e-4);
+ }
+ }
+}
+TEST_F(RangeSamplerTest, UnigramProbabilities2) {
+ sampler_.reset(new UnigramSampler(10));
+ Update2();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), (i + 1) / 55.0, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, UnigramChecksum) {
+ sampler_.reset(new UnigramSampler(10));
+ Update1();
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, UnigramHistogram) {
+ sampler_.reset(new UnigramSampler(10));
+ Update1();
+ CheckHistogram(1000, 0.05);
+}
+
+static const char kVocabContent[] =
+ "w1,1\n"
+ "w2,2\n"
+ "w3,4\n"
+ "w4,8\n"
+ "w5,16\n"
+ "w6,32\n"
+ "w7,64\n"
+ "w8,128\n"
+ "w9,256";
+TEST_F(RangeSamplerTest, FixedUnigramProbabilities) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0));
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 0; i < 9; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, i * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramChecksum) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0));
+ CheckProbabilitiesSumToOne();
+}
+
+TEST_F(RangeSamplerTest, FixedUnigramHistogram) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0));
+ CheckHistogram(1000, 0.05);
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 10, fname, 0.8, 1, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 1; i < 10; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 1) * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2) {
+ Env* env = Env::Default();
+ string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+ TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
+ sampler_.reset(new FixedUnigramSampler(env, 11, fname, 0.8, 2, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ ASSERT_NEAR(sampler_->Probability(1), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 2; i < 11; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 2) * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesFromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0));
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 0; i < 9; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, i * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramChecksumFromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0));
+ CheckProbabilitiesSumToOne();
+}
+TEST_F(RangeSamplerTest, FixedUnigramHistogramFromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0));
+ CheckHistogram(1000, 0.05);
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1FromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(10, weights, 0.8, 1, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 1; i < 10; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 1) * 0.8) / 197.05, 1e-4);
+ }
+}
+TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2FromVector) {
+ std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256};
+ sampler_.reset(new FixedUnigramSampler(11, weights, 0.8, 2, 1, 0));
+ ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4);
+ ASSERT_NEAR(sampler_->Probability(1), 0, 1e-4);
+ // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05
+ for (int i = 2; i < 11; i++) {
+ ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 2) * 0.8) / 197.05, 1e-4);
+ }
+}
+
+// Sample() and Probability() must not be called directly on an AllSampler.
+// We test SampleBatchGetExpectedCount() instead.
+TEST_F(RangeSamplerTest, All) {
+ int batch_size = 10;
+ sampler_.reset(new AllSampler(10));
+ std::vector<int64> batch(batch_size);
+ std::vector<float> batch_expected(batch_size);
+ std::vector<int64> extras(2);
+ std::vector<float> extras_expected(2);
+ extras[0] = 0;
+ extras[1] = batch_size - 1;
+ sampler_->SampleBatchGetExpectedCount(nullptr, // no random numbers needed
+ false, &batch, &batch_expected, extras,
+ &extras_expected);
+ for (int i = 0; i < batch_size; i++) {
+ EXPECT_EQ(i, batch[i]);
+ EXPECT_EQ(1, batch_expected[i]);
+ }
+ EXPECT_EQ(1, extras_expected[0]);
+ EXPECT_EQ(1, extras_expected[1]);
+}
+
+TEST_F(RangeSamplerTest, Unique) {
+ // We sample num_batches batches, each without replacement.
+ //
+ // We check that the returned expected counts roughly agree with each other
+ // and with the average observed frequencies over the set of batches.
+ random::PhiloxRandom philox(123, 17);
+ random::SimplePhilox rnd(&philox);
+ const int range = 100;
+ const int batch_size = 50;
+ const int num_batches = 100;
+ sampler_.reset(new LogUniformSampler(range));
+ std::vector<int> histogram(range);
+ std::vector<int64> batch(batch_size);
+ std::vector<int64> all_values(range);
+ for (int i = 0; i < range; i++) {
+ all_values[i] = i;
+ }
+ std::vector<float> expected(range);
+
+ // Sample one batch and get the expected counts of all values
+ sampler_->SampleBatchGetExpectedCount(
+ &rnd, true, &batch, MutableArraySlice<float>(), all_values, &expected);
+ // Check that all elements are unique
+ std::set<int64> s(batch.begin(), batch.end());
+ CHECK_EQ(batch_size, s.size());
+
+ for (int trial = 0; trial < num_batches; trial++) {
+ std::vector<float> trial_expected(range);
+ sampler_->SampleBatchGetExpectedCount(&rnd, true, &batch,
+ MutableArraySlice<float>(),
+ all_values, &trial_expected);
+ for (int i = 0; i < range; i++) {
+ EXPECT_NEAR(expected[i], trial_expected[i], expected[i] * 0.5);
+ }
+ for (int i = 0; i < batch_size; i++) {
+ histogram[batch[i]]++;
+ }
+ }
+ for (int i = 0; i < range; i++) {
+ // Check that the computed expected count agrees with the average observed
+ // count.
+ const float average_count = static_cast<float>(histogram[i]) / num_batches;
+ EXPECT_NEAR(expected[i], average_count, 0.2);
+ }
+}
+
+TEST_F(RangeSamplerTest, Avoid) {
+ random::PhiloxRandom philox(123, 17);
+ random::SimplePhilox rnd(&philox);
+ sampler_.reset(new LogUniformSampler(100));
+ std::vector<int64> avoided(2);
+ avoided[0] = 17;
+ avoided[1] = 23;
+ std::vector<int64> batch(98);
+
+ // We expect to pick all elements of [0, 100) except the avoided two.
+ sampler_->SampleBatchGetExpectedCountAvoid(
+ &rnd, true, &batch, MutableArraySlice<float>(), ArraySlice<int64>(),
+ MutableArraySlice<float>(), avoided);
+
+ int sum = 0;
+ for (auto val : batch) {
+ sum += val;
+ }
+ const int expected_sum = 100 * 99 / 2 - avoided[0] - avoided[1];
+ EXPECT_EQ(expected_sum, sum);
+}
+
+} // namespace
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reader_base.cc b/tensorflow/core/kernels/reader_base.cc
new file mode 100644
index 0000000000..06211efb38
--- /dev/null
+++ b/tensorflow/core/kernels/reader_base.cc
@@ -0,0 +1,156 @@
+#include "tensorflow/core/kernels/reader_base.h"
+
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+// ReaderBase ------------------------------------------------------
+
+ReaderBase::ReaderBase(const string& name) : name_(name) {}
+
+int64 ReaderBase::NumRecordsProduced() {
+ mutex_lock lock(mu_);
+ return num_records_produced_;
+}
+
+int64 ReaderBase::NumWorkUnitsCompleted() {
+ mutex_lock lock(mu_);
+ return work_finished_;
+}
+
+Status ReaderBase::Reset() {
+ mutex_lock lock(mu_);
+ return ResetLocked();
+}
+
+Status ReaderBase::ResetLocked() {
+ work_started_ = 0;
+ work_finished_ = 0;
+ num_records_produced_ = 0;
+ work_.clear();
+ return Status::OK();
+}
+
+Status ReaderBase::SerializeState(string* state) {
+ mutex_lock lock(mu_);
+ return SerializeStateLocked(state);
+}
+
+Status ReaderBase::SerializeStateLocked(string* state) {
+ return errors::Unimplemented("Reader SerializeState");
+}
+
+Status ReaderBase::RestoreState(const string& state) {
+ mutex_lock lock(mu_);
+ Status status = RestoreStateLocked(state);
+ if (!status.ok()) {
+ ResetLocked();
+ }
+ return status;
+}
+
+Status ReaderBase::RestoreStateLocked(const string& state) {
+ return errors::Unimplemented("Reader RestoreState");
+}
+
+void ReaderBase::Read(QueueInterface* queue, string* key, string* value,
+ OpKernelContext* context) {
+ mutex_lock lock(mu_);
+ while (true) {
+ if (!work_in_progress()) {
+ GetNextWorkLocked(queue, context);
+ if (!context->status().ok()) return;
+ }
+
+ bool produced = false;
+ bool at_end = false;
+ Status status = ReadLocked(key, value, &produced, &at_end);
+
+ if (!at_end && status.ok() && !produced) {
+ status = errors::Internal(
+ "ReadLocked() for ", name(),
+ " must set *at_end=true, *produced=true, or return an error.");
+ }
+ if (!status.ok() && produced) {
+ status = errors::Internal("ReadLocked() for ", name(),
+ " set *produced=true *and* returned an error: ",
+ status.ToString());
+ }
+ if (status.ok() && at_end) {
+ status = OnWorkFinishedLocked();
+ work_finished_ = work_started_;
+ }
+ if (!status.ok()) {
+ context->SetStatus(status);
+ return;
+ }
+ if (produced) {
+ ++num_records_produced_;
+ return;
+ }
+ }
+}
+
+void ReaderBase::GetNextWorkLocked(QueueInterface* queue,
+ OpKernelContext* context) {
+ Notification n;
+ queue->TryDequeue(
+ context, [this, context, &n](const QueueInterface::Tuple& tuple) {
+ if (context->status().ok()) {
+ if (tuple.size() != 1) {
+ context->SetStatus(
+ errors::InvalidArgument("Expected single component queue"));
+ } else if (tuple[0].dtype() != DT_STRING) {
+ context->SetStatus(errors::InvalidArgument(
+ "Expected queue with single string component"));
+ } else if (tuple[0].NumElements() != 1) {
+ context->SetStatus(errors::InvalidArgument(
+ "Expected to dequeue a one-element string tensor"));
+ } else {
+ work_ = tuple[0].flat<string>()(0);
+ ++work_started_;
+ Status status = OnWorkStartedLocked();
+ if (!status.ok()) {
+ context->SetStatus(status);
+ --work_started_;
+ }
+ }
+ }
+ n.Notify();
+ });
+ n.WaitForNotification();
+}
+
+void ReaderBase::SaveBaseState(ReaderBaseState* state) const {
+ state->Clear();
+ state->set_work_started(work_started_);
+ state->set_work_finished(work_finished_);
+ state->set_num_records_produced(num_records_produced_);
+ state->set_current_work(work_);
+}
+
+Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
+ work_started_ = state.work_started();
+ work_finished_ = state.work_finished();
+ num_records_produced_ = state.num_records_produced();
+ work_ = state.current_work();
+ if (work_started_ < 0 || work_finished_ < 0 || num_records_produced_ < 0) {
+ return errors::InvalidArgument(
+ "Unexpected negative value when restoring in ", name(), ": ",
+ state.ShortDebugString());
+ }
+ if (work_started_ > work_finished_) {
+ return errors::InvalidArgument(
+ "Inconsistent work started vs. finished when restoring in ", name(),
+ ": ", state.ShortDebugString());
+ }
+ return Status::OK();
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reader_base.h b/tensorflow/core/kernels/reader_base.h
new file mode 100644
index 0000000000..d344300388
--- /dev/null
+++ b/tensorflow/core/kernels/reader_base.h
@@ -0,0 +1,107 @@
+#ifndef TENSORFLOW_KERNELS_READER_BASE_H_
+#define TENSORFLOW_KERNELS_READER_BASE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/reader_interface.h"
+#include "tensorflow/core/kernels/reader_base.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace tensorflow {
+
+// Default implementation of ReaderInterface.
+class ReaderBase : public ReaderInterface {
+ public:
+ // name: For use in error messages, should mention both the name of
+ // the op and the node.
+ explicit ReaderBase(const string& name);
+
+ // Note that methods with names ending in "Locked" are called while
+ // the ReaderBase's mutex is held.
+
+ // Implement this function in descendants -----------------------------------
+
+ // Produce the next key/value pair from the current work item.
+ // This is called "Locked" since it is executed under a mutex
+ // that serializes all Reader calls.
+ // Usage:
+ // a) If a record was successfully produced, set *produced = true,
+ // and fill in *key and *value.
+ // b) If no more records will be produced for this work item, set
+ // *at_end = true.
+ // c) If a record was produced, but no more will be produced, you
+ // may either do both (a) and (b), or do (a) in this call and do (b) in
+ // the next call to ReadLocked().
+ // d) If there was an error producing (e.g. an error reading the file,
+ // data corruption), return a non-OK() status. ReadLocked may be
+ // called again if the user reruns this part of the graph.
+ virtual Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) = 0;
+
+ // Descendants may optionally implement these -------------------------------
+
+ // Called when work starts / finishes.
+ virtual Status OnWorkStartedLocked() { return Status::OK(); }
+ virtual Status OnWorkFinishedLocked() { return Status::OK(); }
+
+ // Called to reset the Reader to a newly constructed state.
+ virtual Status ResetLocked();
+
+ // Default implementation generates an Unimplemented error.
+ // See the protected helper methods below.
+ virtual Status SerializeStateLocked(string* state);
+ virtual Status RestoreStateLocked(const string& state);
+
+ // Accessors ----------------------------------------------------------------
+
+ // Always true during a call to ReadLocked().
+ bool work_in_progress() const { return work_finished_ < work_started_; }
+
+ // Returns the name of the current work item (valid if
+ // work_in_progress() returns true). May change between calls to
+ // ReadLocked().
+ const string& current_work() const { return work_; }
+
+ // What was passed to the constructor.
+ const string& name() const { return name_; }
+
+ protected:
+ // For descendants wishing to implement serialize & restore state.
+
+ // Writes ReaderBase state to *state.
+ void SaveBaseState(ReaderBaseState* state) const;
+
+ // Restores ReaderBase state from state. Assumes state was filled
+ // using SaveBaseState() above.
+ Status RestoreBaseState(const ReaderBaseState& state);
+
+ private:
+ // Implementations of ReaderInterface methods. These ensure thread-safety
+ // and call the methods above to do the work.
+ void Read(QueueInterface* queue, string* key, string* value,
+ OpKernelContext* context) override;
+ Status Reset() override;
+ int64 NumRecordsProduced() override;
+ int64 NumWorkUnitsCompleted() override;
+ Status SerializeState(string* state) override;
+ Status RestoreState(const string& state) override;
+
+ // For implementing Read(). Dequeues the next work item from
+ // *queue, and if successful updates work_, work_started_
+ // (establishing work_in_progress() == true) and calls
+ // OnWorkStartedLocked(). May block.
+ void GetNextWorkLocked(QueueInterface* queue, OpKernelContext* context);
+
+ mutable mutex mu_;
+ const string name_;
+ int64 work_started_ = 0;
+ int64 work_finished_ = 0;
+ int64 num_records_produced_ = 0;
+ string work_;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_READER_BASE_H_
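To make the ReadLocked() contract above concrete, here is a hedged sketch of a descendant (hypothetical, not part of this change) that emits each work item's name as a single record and then reports end-of-work, exercising cases (a) and (b) from the comment:

    // Hypothetical reader, assumed to live in namespace tensorflow next to
    // ReaderBase: produces exactly one record per work item.
    class SingleRecordReader : public ReaderBase {
     public:
      explicit SingleRecordReader(const string& node_name)
          : ReaderBase(node_name) {}

      Status OnWorkStartedLocked() override {
        emitted_ = false;
        return Status::OK();
      }

      Status ReadLocked(string* key, string* value, bool* produced,
                        bool* at_end) override {
        if (!emitted_) {
          *key = current_work();   // case (a): produce a record.
          *value = current_work();
          *produced = true;
          emitted_ = true;
        } else {
          *at_end = true;          // case (b): nothing left for this work item.
        }
        return Status::OK();
      }

     private:
      bool emitted_ = false;
    };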
diff --git a/tensorflow/core/kernels/reader_base.proto b/tensorflow/core/kernels/reader_base.proto
new file mode 100644
index 0000000000..4335cb2152
--- /dev/null
+++ b/tensorflow/core/kernels/reader_base.proto
@@ -0,0 +1,13 @@
+syntax = "proto3";
+
+package tensorflow;
+// option cc_enable_arenas = true;
+
+// For serializing and restoring the state of ReaderBase, see
+// reader_base.h for details.
+message ReaderBaseState {
+ int64 work_started = 1;
+ int64 work_finished = 2;
+ int64 num_records_produced = 3;
+ bytes current_work = 4;
+};
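For illustration, a descendant (MyReader is hypothetical, assumed to sit in namespace tensorflow) could wire the optional SerializeStateLocked/RestoreStateLocked hooks to this message through the SaveBaseState()/RestoreBaseState() helpers in reader_base.h, using the standard protobuf SerializeToString/ParseFromString calls:

    Status MyReader::SerializeStateLocked(string* state) {
      ReaderBaseState base_state;
      SaveBaseState(&base_state);
      base_state.SerializeToString(state);
      return Status::OK();
    }

    Status MyReader::RestoreStateLocked(const string& state) {
      ReaderBaseState base_state;
      if (!base_state.ParseFromString(state)) {
        return errors::InvalidArgument("Could not parse state for ", name());
      }
      return RestoreBaseState(base_state);
    }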
diff --git a/tensorflow/core/kernels/reader_ops.cc b/tensorflow/core/kernels/reader_ops.cc
new file mode 100644
index 0000000000..38c1013604
--- /dev/null
+++ b/tensorflow/core/kernels/reader_ops.cc
@@ -0,0 +1,132 @@
+// See docs in ../ops/io_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/reader_interface.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class ReaderVerbOpKernel : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+
+ void Compute(OpKernelContext* context) override {
+ ReaderInterface* reader;
+ OP_REQUIRES_OK(context,
+ GetResourceFromContext(context, "reader_handle", &reader));
+ ComputeWithReader(context, reader);
+ reader->Unref();
+ }
+
+ protected:
+ virtual void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) = 0;
+};
+
+class ReaderReadOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ QueueInterface* queue;
+ OP_REQUIRES_OK(context,
+ GetResourceFromContext(context, "queue_handle", &queue));
+ core::ScopedUnref unref_me(queue);
+ Tensor* key = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("key", TensorShape({}), &key));
+ Tensor* value = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("value", TensorShape({}), &value));
+
+ auto key_scalar = key->scalar<string>();
+ auto value_scalar = value->scalar<string>();
+ reader->Read(queue, &key_scalar(), &value_scalar(), context);
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderRead").Device(DEVICE_CPU), ReaderReadOp);
+
+class ReaderNumRecordsProducedOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("records_produced",
+ TensorShape({}), &output));
+ output->scalar<int64>()() = reader->NumRecordsProduced();
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderNumRecordsProduced").Device(DEVICE_CPU),
+ ReaderNumRecordsProducedOp);
+
+class ReaderNumWorkUnitsCompletedOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("units_completed",
+ TensorShape({}), &output));
+ output->scalar<int64>()() = reader->NumWorkUnitsCompleted();
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderNumWorkUnitsCompleted").Device(DEVICE_CPU),
+ ReaderNumWorkUnitsCompletedOp);
+
+class ReaderSerializeStateOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("state", TensorShape({}), &output));
+ OP_REQUIRES_OK(context,
+ reader->SerializeState(&output->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderSerializeState").Device(DEVICE_CPU),
+ ReaderSerializeStateOp);
+
+class ReaderRestoreStateOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ const Tensor* tensor;
+ OP_REQUIRES_OK(context, context->input("state", &tensor));
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsScalar(tensor->shape()),
+ errors::InvalidArgument("Reader state must be scalar, but had shape: ",
+ tensor->shape().DebugString()));
+ OP_REQUIRES_OK(context, reader->RestoreState(tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderRestoreState").Device(DEVICE_CPU),
+ ReaderRestoreStateOp);
+
+class ReaderResetOp : public ReaderVerbOpKernel {
+ public:
+ using ReaderVerbOpKernel::ReaderVerbOpKernel;
+
+ void ComputeWithReader(OpKernelContext* context,
+ ReaderInterface* reader) override {
+ OP_REQUIRES_OK(context, reader->Reset());
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReaderReset").Device(DEVICE_CPU), ReaderResetOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
new file mode 100644
index 0000000000..b412617a65
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -0,0 +1,66 @@
+#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_H_
+#define TENSORFLOW_KERNELS_REDUCTION_OPS_H_
+
+// Functor definitions for Reduction ops, must be compilable by nvcc.
+
+#include <iostream>
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// When eigen3 has a better implementation of AllReducer and AnyReducer,
+// replace the reducers here.
+
+// Reduction using logical_and.
+struct AllReducer {
+ // TODO(zhifengc): Implement PacketAccess when performance matters.
+ static const bool PacketAccess = false;
+ static const bool IsStateful = false;
+
+ EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const {
+ *accum &= t;
+ }
+
+ EIGEN_DEVICE_FUNC bool initialize() const { return true; }
+
+ EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
+};
+
+// Reduction using logical_or.
+struct AnyReducer {
+ // TODO(zhifengc): Implement PacketAccess when performance matters.
+ static const bool PacketAccess = false;
+ static const bool IsStateful = false;
+
+ EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const {
+ *accum |= t;
+ }
+
+ EIGEN_DEVICE_FUNC bool initialize() const { return false; }
+
+ EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
+};
+
+template <typename Device, typename OUT_T, typename IN_T,
+ typename ReductionAxes, typename Reducer>
+void ReduceEigenImpl(const Device& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer) {
+ out.device(d) = in.reduce(reduction_axes, reducer);
+}
+
+template <typename Device>
+struct ReduceFunctor {
+ template <typename OUT_T, typename IN_T, typename ReductionAxes,
+ typename Reducer>
+ static void Reduce(const Device& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REDUCTION_OPS_H_
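The reducers above follow the initialize/reduce/finalize protocol that Eigen's reduction evaluator drives; a standalone sketch (illustrative only) of that protocol applied by hand with AllReducer:

    // Computes a logical AND over n booleans the same way Eigen would drive
    // AllReducer: start from initialize(), fold with reduce(), then finalize().
    bool AllOf(const bool* values, int n) {
      tensorflow::functor::AllReducer reducer;
      bool accum = reducer.initialize();    // true
      for (int i = 0; i < n; ++i) {
        reducer.reduce(values[i], &accum);  // accum &= values[i]
      }
      return reducer.finalize(accum);       // returns accum unchanged
    }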
diff --git a/tensorflow/core/kernels/reduction_ops_all.cc b/tensorflow/core/kernels/reduction_ops_all.cc
new file mode 100644
index 0000000000..11d399e70a
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_all.cc
@@ -0,0 +1,17 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("All")
+ .Device(DEVICE_CPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<CPUDevice, bool, functor::AllReducer>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("All")
+ .Device(DEVICE_GPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<GPUDevice, bool, functor::AllReducer>);
+#endif
+
+} // namespace tensorflow
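For intuition, the registered All op performs a logical-AND reduction over the requested axes: for a [2, 2] bool input [[true, false], [true, true]], reduction_indices = [1] yields [false, true], and reduction_indices = [0, 1] yields the scalar false.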
diff --git a/tensorflow/core/kernels/reduction_ops_any.cc b/tensorflow/core/kernels/reduction_ops_any.cc
new file mode 100644
index 0000000000..a89ef22b08
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_any.cc
@@ -0,0 +1,17 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Any")
+ .Device(DEVICE_CPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<CPUDevice, bool, functor::AnyReducer>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Any")
+ .Device(DEVICE_GPU)
+ .HostMemory("reduction_indices"),
+ ReductionOp<GPUDevice, bool, functor::AnyReducer>);
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
new file mode 100644
index 0000000000..2bde3a1a54
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -0,0 +1,302 @@
+// This is an internal header file intended to only be included as the
+// front-matter in the implementation files of various reduction ops. It
+// is a header file because we split the various reduction ops into their
+// own compilation units to get more parallelism in compilation.
+
+#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
+#define TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/reduction_ops.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device>
+struct Constants {
+ // Derive Index type. int (32-bit) or long (64-bit) depending on the
+ // compile-time configuration. "float" here is not relevant.
+ // TODO(zhifengc): Moves the definition to TTypes.
+ typedef TTypes<float>::Tensor::Index Index;
+ Eigen::array<Index, 1> kZero;
+ Eigen::array<Index, 1> kOne;
+ Eigen::array<Index, 2> kZeroTwo;
+
+ Constants() {
+ kZero[0] = 0;
+ kOne[0] = 1;
+ kZeroTwo[0] = 0;
+ kZeroTwo[1] = 2;
+ }
+};
+
+#if defined(EIGEN_HAS_INDEX_LIST)
+template <>
+struct Constants<CPUDevice> {
+ const Eigen::IndexList<Eigen::type2index<0>> kZero;
+ const Eigen::IndexList<Eigen::type2index<1>> kOne;
+ const Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2>> kZeroTwo;
+};
+#endif
+
+namespace {
+
+class ReductionHelper {
+ public:
+ ReductionHelper() : reduce_first_axis_(false) {}
+
+ Status Simplify(const Tensor& data, const Tensor& axis,
+ const bool keep_dims) {
+ // bitmap[i] indicates whether to reduce data along i-th axis.
+ std::vector<bool> bitmap(data.dims(), false);
+ auto axis_vec = axis.flat<int32>();
+ for (int64 i = 0; i < axis.NumElements(); ++i) {
+ const int32 index = axis_vec(i);
+ if (index < 0 || index >= data.dims()) {
+ return errors::OutOfRange("Invalid reduction dimension (", index,
+ " for input with ", data.dims(),
+ " dimension(s)");
+ }
+ bitmap[index] = true;
+ }
+
+ // Output tensor's dim sizes.
+ out_shape_.clear();
+ for (int i = 0; i < data.dims(); ++i) {
+ if (!bitmap[i]) {
+ // If we are not reducing along dimension i.
+ out_shape_.push_back(data.dim_size(i));
+ } else if (keep_dims) {
+ // We are reducing along dimension i, but we want to keep the
+ // same number of dimensions, so we set the dimension of i to
+ // '1'.
+ out_shape_.push_back(1);
+ }
+ }
+
+ // Depending on bitmap[i] and bitmap[i-1], we can collapse adjacent axes
+ // of the input data before doing the reduction on the resulting
+ // tensor. The shape of the reduction result is a reshape of the final
+ // output.
+
+ // We'll skip the leading 1s.
+ int dim_index = 0;
+ for (; dim_index < data.dims(); ++dim_index) {
+ if (data.dim_size(dim_index) != 1) break;
+ }
+ if (dim_index >= data.dims()) {
+ // Special case. The input is essentially a scalar.
+ reduce_first_axis_ = true;
+ } else {
+ // Starting from the (dim_index)-th dimension, the dimensions
+ // alternate between runs that need to be reduced and runs that
+ // don't.
+ //
+ // NOTE: If a dimension has size 1, we group it with the current
+ // run so that we can minimize the number of runs.
+ //
+ // E.g., when we want to reduce a tensor of shape [2, 1, 3, 1,
+ // 5] by axes = [1, 4], we should treat it as a [6, 5] tensor
+ // and reduce by axes = [1] (i.e., the output is shape [6]).
+ reduce_first_axis_ = bitmap[dim_index];
+ data_reshape_.push_back(data.dim_size(dim_index));
+ ++dim_index;
+ for (; dim_index < data.dims(); ++dim_index) {
+ const auto size = data.dim_size(dim_index);
+ if (size == 1) {
+ bitmap[dim_index] = bitmap[dim_index - 1];
+ }
+ if (bitmap[dim_index - 1] != bitmap[dim_index]) {
+ // Starts a new run of reduce or !reduce.
+ data_reshape_.push_back(size);
+ } else {
+ // Continue a run of reduce or !reduce.
+ data_reshape_.back() *= size;
+ }
+ }
+ // If reduce_first_axis_ is true (the input's dimensions 0, 2, 4, ...
+ // are reduced), out_reshape_ is data_reshape_[1, 3, 5, ...];
+ // otherwise, it is data_reshape_[0, 2, 4, ...].
+ for (size_t i = reduce_first_axis_ ? 1 : 0; i < data_reshape_.size();
+ i += 2) {
+ out_reshape_.push_back(data_reshape_[i]);
+ }
+ }
+
+ VLOG(1) << "data reshape: " << str_util::Join(data_reshape_, ",");
+ VLOG(1) << "out reshape: " << str_util::Join(out_reshape_, ",");
+ VLOG(1) << "out shape: " << str_util::Join(out_shape_, ",");
+ return Status::OK();
+ }
+
+ // We need to do roughly:
+ // tmp_out = allocate(out_reshape())
+ // tmp_out.reshape(out_reshape) = data.reshape(data_reshape).reduce(axes)
+ // out = tmp_out.reshape(out_shape)
+
+ // The reduction result must be allocated with this shape.
+ TensorShape out_reshape() const {
+ TensorShape shape;
+ for (auto size : out_reshape_) shape.AddDim(size);
+ return shape;
+ }
+
+ // The final output shape must be allocated with this shape.
+ TensorShape out_shape() const {
+ TensorShape shape;
+ for (auto size : out_shape_) shape.AddDim(size);
+ return shape;
+ }
+
+ // The reduction is on a reshaped tensor of this rank.
+ int ndims() const { return data_reshape_.size(); }
+
+ // True if we need to reduce the 0-th dimension.
+ bool reduce_first_axis() const { return reduce_first_axis_; }
+
+ // The output is reshaped.
+ template <typename T, int N>
+ typename TTypes<T, N>::Tensor out(Tensor* out) {
+ return out->shaped<T, N>(out_reshape_);
+ }
+
+ // The input is reshaped.
+ template <typename T, int N>
+ typename TTypes<T, N>::ConstTensor in(const Tensor& data) {
+ return data.shaped<T, N>(data_reshape_);
+ }
+
+ private:
+ bool reduce_first_axis_; // True if we need to reduce the 0-th dimension.
+ std::vector<int64> data_reshape_; // Reshape the data before reduction.
+ std::vector<int64> out_shape_; // The final output shape.
+ std::vector<int64> out_reshape_; // Reshape the output for reduction.
+};
+
+} // end namespace
+
+// For operations where the output is a reduction function along some
+// dimensions of the input.
+template <typename Device, class T, typename Reducer>
+class ReductionOp : public OpKernel {
+ public:
+ explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt}));
+
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& data = ctx->input(0);
+ const Tensor& axes = ctx->input(1);
+ VLOG(1) << "data shape: " << data.shape().ShortDebugString();
+ VLOG(1) << "axes : " << axes.SummarizeValue(10);
+
+ ReductionHelper helper;
+ OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_));
+ CHECK_GE(helper.ndims(), 0);
+
+ // The real output shape will be assigned below.
+ TensorShape empty_shape;
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &out));
+
+ if (helper.ndims() == 0 ||
+ (helper.ndims() == 1 && !helper.reduce_first_axis())) {
+ // Special case. Reduces nothing. It is unclear why this is
+ // necessary, but tests fail without it. Look into why this
+ // case occurs.
+ if (!out->CopyFrom(data, helper.out_shape())) {
+ ctx->SetStatus(errors::Internal("Error during reduction copy."));
+ }
+ return;
+ }
+
+ // A temporary tensor whose size matches the size of the reduced
+ // output.
+ Tensor tmp_out;
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_temp(out->dtype(), helper.out_reshape(), &tmp_out));
+
+ typedef functor::ReduceFunctor<Device> Functor;
+ Constants<Device> constants;
+ const Device& d = ctx->eigen_device<Device>();
+ Reducer reducer;
+
+ if ((helper.ndims() == 1) && helper.reduce_first_axis()) {
+ // Reduce to a scalar.
+ Functor::Reduce(d, helper.out<T, 0>(&tmp_out), helper.in<T, 1>(data),
+ constants.kZero, reducer);
+ } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a matrix along 1st dimension.
+ Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
+ constants.kZero, reducer);
+ } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a matrix along 2nd dimension.
+ Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data),
+ constants.kOne, reducer);
+ } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a 3D tensor along 1st and 3rd
+ // dimensions.
+ Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 3>(data),
+ constants.kZeroTwo, reducer);
+ } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) {
+ // Can be viewed as a reduction of a 3D tensor along 2nd dimension.
+ Functor::Reduce(d, helper.out<T, 2>(&tmp_out), helper.in<T, 3>(data),
+ constants.kOne, reducer);
+ } else {
+ // TODO(zhifengc): We can implement reduction for arbitrary rank
+ // tensor and arbitrary reduction axes by iterating the reduction
+ // multiple times. This may also be accomplished in the graph
+ // construction.
+ ctx->SetStatus(
+ errors::Unimplemented("Reducing ", data.shape().ShortDebugString(),
+ " axes [", axes.SummarizeValue(10), "] to ",
+ tmp_out.shape().ShortDebugString()));
+ return;
+ }
+
+ // Set the real output using the contents of the reduction but the
+ // real expected output shape. The number of elements should
+ // match between the two shapes.
+ if (!out->CopyFrom(tmp_out, helper.out_shape())) {
+ ctx->SetStatus(errors::Internal("Error during reduction copy."));
+ }
+ }
+
+ private:
+ // True if the number of dimensions should be maintained.
+ bool keep_dims_;
+};
+
+namespace functor {
+
+template <>
+struct ReduceFunctor<CPUDevice> {
+ template <typename OUT_T, typename IN_T, typename ReductionAxes,
+ typename Reducer>
+ static void Reduce(const CPUDevice& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer) {
+ ReduceEigenImpl(d, out, in, reduction_axes, reducer);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
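A worked trace of ReductionHelper::Simplify() for the example given in its comments (input shape [2, 1, 3, 1, 5], axes = {1, 4}, assuming keep_dims = false):

    bitmap        = {false, true, false, false, true}
    out_shape_    = {2, 3, 1}   // the non-reduced dimensions, in order
    data_reshape_ = {6, 5}      // adjacent kept/reduced runs collapsed
    out_reshape_  = {6}         // reduce_first_axis_ == false, so even-indexed runs survive

ReductionOp therefore reduces a [6, 5] view of the data along its second dimension (the ndims() == 2, !reduce_first_axis() branch) and reshapes the six-element result to the final [2, 3, 1] output.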
diff --git a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc
new file mode 100644
index 0000000000..8e29d2d06c
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc
@@ -0,0 +1,65 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/kernels/reduction_ops.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Derive Index type. int (32-bit) or long (64-bit) depending on the
+// compile-time configuration. "float" here is not relevant.
+// TODO(zhifengc): Moves the definition to TTypes.
+typedef TTypes<float>::Tensor::Index Index;
+
+template <>
+struct ReduceFunctor<GPUDevice> {
+ template <typename OUT_T, typename IN_T, typename ReductionAxes,
+ typename Reducer>
+ static void Reduce(const GPUDevice& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Reducer& reducer) {
+ ReduceEigenImpl(d, To32Bit(out), To32Bit(in), reduction_axes, reducer);
+ }
+};
+
+// T: the data type
+// REDUCER: the reducer functor
+// IN_DIMS: the number of dimensions of the input tensor
+// NUM_AXES: the number of axes to reduce
+#define DEFINE(T, REDUCER, IN_DIMS, NUM_AXES) \
+ template void ReduceFunctor<GPUDevice>::Reduce( \
+ const GPUDevice& d, TTypes<T, IN_DIMS - NUM_AXES>::Tensor out, \
+ TTypes<T, IN_DIMS>::ConstTensor in, \
+ const Eigen::array<Index, NUM_AXES>& reduction_axes, \
+ const REDUCER& reducer);
+
+#define DEFINE_FOR_TYPE_AND_R(T, R) \
+ DEFINE(T, R, 1, 1); \
+ DEFINE(T, R, 2, 1); \
+ DEFINE(T, R, 3, 1); \
+ DEFINE(T, R, 3, 2);
+
+#define DEFINE_FOR_ALL_REDUCERS(T) \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+ DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
+
+DEFINE_FOR_ALL_REDUCERS(float);
+#undef DEFINE_FOR_ALL_REDUCERS
+
+DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(bool, AllReducer);
+DEFINE_FOR_TYPE_AND_R(bool, AnyReducer);
+#undef DEFINE_FOR_TYPE_AND_R
+
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
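As a concrete reading of the macros above, DEFINE(float, Eigen::internal::SumReducer<float>, 3, 2) expands to the explicit instantiation below: a rank-3 float tensor reduced over two axes into a rank-1 result.

    template void ReduceFunctor<GPUDevice>::Reduce(
        const GPUDevice& d, TTypes<float, 1>::Tensor out,
        TTypes<float, 3>::ConstTensor in,
        const Eigen::array<Index, 2>& reduction_axes,
        const Eigen::internal::SumReducer<float>& reducer);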
diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc
new file mode 100644
index 0000000000..1749360b6e
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_max.cc
@@ -0,0 +1,26 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Max").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::MaxReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Max") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::MaxReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc
new file mode 100644
index 0000000000..b00c36fed8
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_mean.cc
@@ -0,0 +1,12 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Mean").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::MeanReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
new file mode 100644
index 0000000000..de1f4b8520
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -0,0 +1,26 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Min").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::MinReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Min") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::MinReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc
new file mode 100644
index 0000000000..4068c7feda
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_prod.cc
@@ -0,0 +1,26 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Prod").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Prod") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
new file mode 100644
index 0000000000..82d685e225
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -0,0 +1,37 @@
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Sum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReductionOp<CPUDevice, type, Eigen::internal::SumReducer<type>>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+// NOTE: We should have mean(complex64,int32), too. But that requires
+// changing Eigen::internal::MeanReducer to cast int to complex<float>.
+// We don't see an immediate need for mean(complex64,int32) anyway.
+REGISTER_KERNEL_BUILDER(
+ Name("Sum").Device(DEVICE_CPU).TypeConstraint<complex64>("T"),
+ ReductionOp<CPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Sum") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<GPUDevice, type, Eigen::internal::SumReducer<type>>);
+REGISTER_GPU_KERNELS(float);
+#undef REGISTER_GPU_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+ Name("Sum").Device(DEVICE_GPU).TypeConstraint<complex64>("T"),
+ ReductionOp<GPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);
+
+#endif
+
+} // namespace tensorflow
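As a shape example for these registrations (the actual work is done by ReductionOp in reduction_ops_common.h): summing a [2, 3] float tensor with reduction_indices = [0] produces a [3] result, or [1, 3] when the keep_dims attribute is set.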
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
new file mode 100644
index 0000000000..d96da3c7f1
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -0,0 +1,73 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+// Creates a Graph which applies the reduction op "reduce" to a 3D float
+// tensor of "num" elements, producing a scalar.
+static Graph* ToScalar(const string& reduce, int num) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
+ data.flat<float>().setRandom();
+ Tensor axes(DT_INT32, TensorShape({3}));
+ axes.flat<int32>()(0) = 0;
+ axes.flat<int32>()(1) = 1;
+ axes.flat<int32>()(2) = 2;
+ test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
+ test::graph::Constant(g, axes));
+ return g;
+}
+
+// Creates a benchmark which reduces a 3D tensor with a total of "num" floats
+// into a scalar on a "device". Runs the benchmark "iters" times.
+static void ReduceToScalar(int iters, const string& device,
+ const string& reduce, int num) {
+ testing::ItemsProcessed(static_cast<int64>(iters) * num);
+ testing::BytesProcessed(static_cast<int64>(iters) * num * sizeof(float));
+ test::Benchmark(device, ToScalar(reduce, num)).Run(iters);
+}
+
+static void BM_Sum3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Sum", num);
+}
+BENCHMARK(BM_Sum3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Max3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Max", num);
+}
+BENCHMARK(BM_Max3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Prod3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Prod", num);
+}
+BENCHMARK(BM_Prod3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Mean3DToScalarCPU(int iters, int num) {
+ ReduceToScalar(iters, "cpu", "Mean", num);
+}
+BENCHMARK(BM_Mean3DToScalarCPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Sum3DToScalarGPU(int iters, int num) {
+ ReduceToScalar(iters, "gpu", "Sum", num);
+}
+BENCHMARK(BM_Sum3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Max3DToScalarGPU(int iters, int num) {
+ ReduceToScalar(iters, "gpu", "Max", num);
+}
+BENCHMARK(BM_Max3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+static void BM_Prod3DToScalarGPU(int iters, int num) {
+ ReduceToScalar(iters, "gpu", "Prod", num);
+}
+BENCHMARK(BM_Prod3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+// Once Mean is available on GPU, enable this.
+// static void BM_Mean3DToScalarGPU(int iters, int num) {
+// ReduceToScalar(iters, "gpu", "Mean", num);
+// }
+// BENCHMARK(BM_Mean3DToScalarGPU)->Range(1 << 13, 1 << 20);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h
new file mode 100644
index 0000000000..77c6ef35e9
--- /dev/null
+++ b/tensorflow/core/kernels/reference_gemm.h
@@ -0,0 +1,75 @@
+#ifndef TENSORFLOW_KERNELS_REFERENCE_GEMM_H_
+#define TENSORFLOW_KERNELS_REFERENCE_GEMM_H_
+
+// This is an unoptimized but debuggable implementation of the GEMM matrix
+// multiply function, used to compare to faster but more opaque versions, or
+// for bit depths or argument combinations that aren't supported by optimized
+// code.
+// It assumes the row-major convention used by TensorFlow, and implements
+// C = A * B, like the standard BLAS GEMM interface. If the transpose flags are
+// true, then the relevant matrix is treated as stored in column-major order.
+
+namespace tensorflow {
+template <class T1, class T2, class T3>
+void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c,
+ size_t m, size_t n, size_t k, const T1* a, T1 offset_a,
+ size_t lda, const T2* b, T2 offset_b, size_t ldb, T3* c,
+ int32 shift_c, int32 offset_c, int32 mult_c, size_t ldc) {
+ int a_i_stride;
+ int a_l_stride;
+ if (transpose_a) {
+ a_i_stride = 1;
+ a_l_stride = lda;
+ } else {
+ a_i_stride = lda;
+ a_l_stride = 1;
+ }
+ int b_j_stride;
+ int b_l_stride;
+ if (transpose_b) {
+ b_j_stride = ldb;
+ b_l_stride = 1;
+ } else {
+ b_j_stride = 1;
+ b_l_stride = ldb;
+ }
+ int c_i_stride;
+ int c_j_stride;
+ if (transpose_c) {
+ c_i_stride = 1;
+ c_j_stride = ldc;
+ } else {
+ c_i_stride = ldc;
+ c_j_stride = 1;
+ }
+
+ const int32 highest = static_cast<int32>(Eigen::NumTraits<T3>::highest());
+ const int32 lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest());
+ const int32 rounding = (shift_c < 1) ? 0 : (1 << (shift_c - 1));
+
+ int i, j, l;
+ for (j = 0; j < n; j++) {
+ for (i = 0; i < m; i++) {
+ int32 total = 0;
+ for (l = 0; l < k; l++) {
+ const size_t a_index = ((i * a_i_stride) + (l * a_l_stride));
+ const int32 a_value = a[a_index] - offset_a;
+ const size_t b_index = ((j * b_j_stride) + (l * b_l_stride));
+ const int32 b_value = b[b_index] - offset_b;
+ total += (a_value * b_value);
+ }
+ const size_t c_index = ((i * c_i_stride) + (j * c_j_stride));
+ int32_t output = ((((total + offset_c) * mult_c) + rounding) >> shift_c);
+ if (output > highest) {
+ output = highest;
+ }
+ if (output < lowest) {
+ output = lowest;
+ }
+ c[c_index] = static_cast<T3>(output);
+ }
+ }
+}
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REFERENCE_GEMM_H_
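A usage sketch with illustrative values (it assumes the TensorFlow integer typedefs and Eigen's NumTraits are already in scope, as they are in the kernels that include this header): multiply a row-major 2x3 uint8 matrix by a 3x2 uint8 matrix into a 2x2 int32 result, with the quantization offsets, multiplier and shift set to their identity values.

    #include "tensorflow/core/kernels/reference_gemm.h"

    void SketchReferenceGemm() {
      const size_t m = 2, n = 2, k = 3;
      const tensorflow::uint8 a[m * k] = {1, 2, 3, 4, 5, 6};     // 2x3
      const tensorflow::uint8 b[k * n] = {7, 8, 9, 10, 11, 12};  // 3x2
      tensorflow::int32 c[m * n];
      tensorflow::ReferenceGemm<tensorflow::uint8, tensorflow::uint8,
                                tensorflow::int32>(
          false, false, false, m, n, k, a, /*offset_a=*/0, /*lda=*/k, b,
          /*offset_b=*/0, /*ldb=*/n, c, /*shift_c=*/0, /*offset_c=*/0,
          /*mult_c=*/1, /*ldc=*/n);
      // c now holds {58, 64, 139, 154}.
    }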
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
new file mode 100644
index 0000000000..d5dd7a8119
--- /dev/null
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -0,0 +1,154 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/relu_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> {
+ public:
+ using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp;
+
+ void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+ functor::Relu<Device, T> functor;
+ functor(context->eigen_device<Device>(), input.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> {
+ public:
+ using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp;
+
+ void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+ functor::Relu6<Device, T> functor;
+ functor(context->eigen_device<Device>(), input.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> {
+ public:
+ using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp;
+
+ // INPUTS:
+ // g (gradients): backpropagated gradients
+ // a (inputs): inputs that were passed to ReluOp()
+ // OUTPUT:
+ // gradients to backprop
+ template <int NDIMS>
+ void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+ Tensor* output) {
+ OP_REQUIRES(context, a.IsSameSize(g),
+ errors::InvalidArgument("g and a must be the same size"));
+ functor::ReluGrad<Device, T> functor;
+ functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> {
+ public:
+ using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp;
+
+ // INPUTS:
+ // g (gradients): backpropagated gradients
+ // a (inputs): inputs that were passed to Relu6Op()
+ // OUTPUT:
+ // gradients to backprop
+ template <int NDIMS>
+ void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+ Tensor* output) {
+ OP_REQUIRES(context, a.IsSameSize(g),
+ errors::InvalidArgument("g and a must be the same size"));
+ functor::Relu6Grad<Device, T> functor;
+ functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReluOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ Relu6Op<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReluGradOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6Grad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ Relu6GradOp<CPUDevice, type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void Relu<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor activations); \
+ extern template struct Relu<GPUDevice, T>; \
+ \
+ template <> \
+ void ReluGrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \
+ typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor backprops); \
+ \
+ extern template struct ReluGrad<GPUDevice, T>; \
+ template <> \
+ void Relu6<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor activations); \
+ extern template struct Relu6<GPUDevice, T>; \
+ \
+ template <> \
+ void Relu6Grad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \
+ typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor backprops); \
+ extern template struct Relu6Grad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ ReluOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ Relu6Op<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ ReluGradOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6Grad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ Relu6GradOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
new file mode 100644
index 0000000000..8ed071cc4a
--- /dev/null
+++ b/tensorflow/core/kernels/relu_op.h
@@ -0,0 +1,79 @@
+#ifndef TENSORFLOW_KERNELS_RELU_OP_H_
+#define TENSORFLOW_KERNELS_RELU_OP_H_
+// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by ReluOp to do the computations.
+template <typename Device, typename T>
+struct Relu {
+ // Computes Relu activation.
+ //
+ // features: any shape.
+ // activations: same shape as "features".
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor activations) {
+ activations.device(d) = features.cwiseMax(static_cast<T>(0));
+ }
+};
+
+// Functor used by ReluGradOp to do the computations.
+template <typename Device, typename T>
+struct ReluGrad {
+ // Computes ReluGrad backprops.
+ //
+ // gradients: gradients backpropagated to the Relu op.
+ // features: inputs that were passed to the Relu op.
+ // backprops: gradients to backpropagate to the Relu inputs.
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+ typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor backprops) {
+ // NOTE: When the activation is exactly zero, we arbitrarily choose to not
+ // propagate the associated gradient value.
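+ // For example (informal): with features = {-1, 0, 2} and gradients =
+ // {g0, g1, g2}, the expression below yields backprops = {0, 0, g2}, i.e.
+ // gradients flow only where the input was strictly positive.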
+ backprops.device(d) =
+ gradients * (features > features.constant(static_cast<T>(0)));
+ }
+};
+
+// Functor used by Relu6Op to do the computations.
+template <typename Device, typename T>
+struct Relu6 {
+ // Computes Relu6 activation.
+ //
+ // features: any shape.
+ // activations: same shape as "features".
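+ //
+ // For example (informal): features = {-3, 2, 9} maps to activations =
+ // {0, 2, 6}.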
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor activations) {
+ activations.device(d) =
+ features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
+ }
+};
+
+// Functor used by Relu6GradOp to do the computations.
+template <typename Device, typename T>
+struct Relu6Grad {
+ // Computes Relu6Grad backprops.
+ //
+ // gradients: gradients backpropagated to the Relu6 op.
+ // features: inputs that were passed to the Relu6 op.
+ // backprops: gradients to backpropagate to the Relu6 inputs.
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+ typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor backprops) {
+ // NOTE: When the activation is exactly zero or six, we
+ // arbitrarily choose to not propagate the associated gradient
+ // value.
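+ // For example (informal): with features = {0, 3, 7} and gradients =
+ // {g0, g1, g2}, the expression below yields backprops = {0, g1, 0}.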
+ backprops.device(d) = gradients *
+ (features > features.constant(static_cast<T>(0))) *
+ (features < features.constant(static_cast<T>(6)));
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RELU_OP_H_
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
new file mode 100644
index 0000000000..6bd87ff8e4
--- /dev/null
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -0,0 +1,27 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/relu_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in relu_op.cc.
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::Relu<GPUDevice, T>; \
+ template struct functor::ReluGrad<GPUDevice, T>; \
+ template struct functor::Relu6<GPUDevice, T>; \
+ template struct functor::Relu6Grad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
new file mode 100644
index 0000000000..7e1cf029de
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -0,0 +1,29 @@
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/reshape_op.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Reshape").Device(DEVICE_CPU).HostMemory("shape"),
+ ReshapeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Reshape") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("shape") \
+ .TypeConstraint<type>("T"), \
+ ReshapeOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Reshape")
+ .Device(DEVICE_GPU)
+ .HostMemory("tensor")
+ .HostMemory("shape")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ ReshapeOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reshape_op.h b/tensorflow/core/kernels/reshape_op.h
new file mode 100644
index 0000000000..3fd3f4492e
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_op.h
@@ -0,0 +1,83 @@
+#ifndef TENSORFLOW_KERNELS_RESHAPE_OP_H_
+#define TENSORFLOW_KERNELS_RESHAPE_OP_H_
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class ReshapeOp : public OpKernel {
+ public:
+ explicit ReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& sizes = context->input(1);
+ // Preliminary validation of sizes.
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyVector(sizes.shape()),
+ errors::InvalidArgument("sizes input must be 1-D, not shape ",
+ sizes.shape().ShortDebugString()));
+ const int64 num_dims = sizes.NumElements();
+ OP_REQUIRES(
+ context, num_dims <= 8,
+ errors::InvalidArgument(num_dims, " > max 8 output dims supported"));
+
+ // Compute the output shape. Determine product of specified
+ // dimensions, and find the index of the unspecified one.
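+ // For example (informal): an input with 12 elements reshaped with
+ // sizes = {3, -1} gives product = 3 and unknown_index = 1, so the
+ // missing dimension is inferred below as 12 / 3 = 4.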
+ TensorShape shape;
+ int32 product = 1;
+ int unknown_index = -1;
+ auto Svec = sizes.flat<int32>();
+ for (int d = 0; d < num_dims; ++d) {
+ const int32 size = Svec(d);
+ if (size == -1) {
+ OP_REQUIRES(
+ context, unknown_index == -1,
+ errors::InvalidArgument("only one input size may be -1, not both ",
+ unknown_index, " and ", d));
+ unknown_index = d;
+ shape.AddDim(1);
+ } else {
+ OP_REQUIRES(context, size >= 0,
+ errors::InvalidArgument(
+ "size ", d, " must be non-negative, not ", size));
+ shape.AddDim(size);
+ product *= size;
+ }
+ }
+ if (unknown_index != -1) {
+ OP_REQUIRES(
+ context, product > 0,
+ errors::InvalidArgument("cannot infer the missing input size for "
+ "an empty tensor unless all specified "
+ "input sizes are non-zero"));
+ const int32 missing = input.NumElements() / product;
+ OP_REQUIRES(context, product * missing == input.NumElements(),
+ errors::InvalidArgument("Input has ", input.NumElements(),
+ " values, which isn't divisible by ",
+ product));
+ shape.set_dim(unknown_index, missing);
+ }
+ OP_REQUIRES(context, shape.num_elements() == input.NumElements(),
+ errors::InvalidArgument("Input has ", input.NumElements(),
+ " values, which isn't the same as ",
+ shape.num_elements()));
+
+ // Actually produce the reshaped output.
+ Tensor output(input.dtype());
+ CHECK(output.CopyFrom(input, shape));
+ context->set_output(0, output);
+ }
+
+ bool IsExpensive() override { return false; }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_RESHAPE_OP_H_
diff --git a/tensorflow/core/kernels/resize_area_op.cc b/tensorflow/core/kernels/resize_area_op.cc
new file mode 100644
index 0000000000..2b22d38ad6
--- /dev/null
+++ b/tensorflow/core/kernels/resize_area_op.cc
@@ -0,0 +1,139 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeAreaOp : public OpKernel {
+ public:
+ explicit ResizeAreaOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
+
+ // A temporary tensor for computing the sum.
+ Tensor sum_tensor;
+ OP_REQUIRES_OK(
+ context, context->allocate_temp(DataTypeToEnum<float>::value,
+ TensorShape({channels}), &sum_tensor));
+ typename TTypes<float, 1>::Tensor sum_data = sum_tensor.vec<float>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
+
+ // When using this algorithm for downsizing, the target pixel value is the
+ // weighted average of all the source pixels. The weight is determined by
+ // the contribution percentage of the source pixel.
+ //
+ // Let "scale" be "target_image_size/source_image_size". If 1/n of the
+ // source pixel contributes to the target pixel, then the weight is (1/n *
+ // scale); if the complete source pixel contributes to the target pixel,
+ // then the weight is scale.
+ //
+ // To visualize the implementation, use one dimension as an example:
+ // Resize in[4] to out[3].
+ // scale = 3/4 = 0.75
+ // out[0]: in[0] and 1/3 of in[1]
+ // out[1]: 2/3 of in[1] and 2/3 of in[2]
+ // out[2]: 1/3 of in[2] and in[3]
+ // Hence, the output pixel values are:
+ // out[0] = (in[0] * 1.0 + in[1] * 1/3) * scale
+ // out[1] = (in[1] * 2/3 + in[2] * 2/3) * scale
+ // out[2] = (in[2] * 1/3 + in[3] * 1.0) * scale
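+ // As an informal numerical check (values chosen only for illustration):
+ // for in = {10, 20, 30, 40} this gives
+ // out[0] = (10 * 1.0 + 20 * 1/3) * 0.75 = 12.5
+ // out[1] = (20 * 2/3 + 30 * 2/3) * 0.75 = 25.0
+ // out[2] = (30 * 1/3 + 40 * 1.0) * 0.75 = 37.5
+ // and the weights for each output pixel sum to exactly 1.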
+ float scale = 1.0 / (height_scale * width_scale);
+ for (int64 b = 0; b < batch_size; ++b) {
+ for (int64 y = 0; y < out_height; ++y) {
+ const float in_y = y * height_scale;
+ const float in_y1 = (y + 1) * height_scale;
+ // The start and end height indices of all the cells that could
+ // contribute to the target cell.
+ int64 y_start = floor(in_y);
+ int64 y_end = ceil(in_y1);
+
+ for (int64 x = 0; x < out_width; ++x) {
+ const float in_x = x * width_scale;
+ const float in_x1 = (x + 1) * width_scale;
+ // The start and end width indices of all the cells that could
+ // contribute to the target cell.
+ int64 x_start = floor(in_x);
+ int64 x_end = ceil(in_x1);
+
+ sum_data.setConstant(0.0);
+ for (int64 i = y_start; i < y_end; ++i) {
+ float scale_y =
+ i < in_y ? i + 1 - in_y : (i + 1 > in_y1 ? in_y1 - i : 1.0);
+ for (int64 j = x_start; j < x_end; ++j) {
+ float scale_x =
+ j < in_x ? j + 1 - in_x : (j + 1 > in_x1 ? in_x1 - j : 1.0);
+ for (int64 c = 0; c < channels; ++c) {
+#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
+ sum_data(c) +=
+ input_data(b, BOUND(i, in_height), BOUND(j, in_width), c) *
+ scale_y * scale_x * scale;
+#undef BOUND
+ }
+ }
+ }
+ for (int64 c = 0; c < channels; ++c) {
+ output_data(b, y, x, c) = sum_data(c);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeArea") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeAreaOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
new file mode 100644
index 0000000000..472fc19b82
--- /dev/null
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -0,0 +1,121 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeBicubicOp : public OpKernel {
+ public:
+ explicit ResizeBicubicOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ // Initialize shape to the batch size of the input, then add
+ // the rest of the dimensions
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
+
+ // Initialize coefficients table using Bicubic convolution algorithm.
+ // https://en.wikipedia.org/wiki/Bicubic_interpolation
+ static const int64 tab_size = (1 << 10);
+ static float coeffs_tab[(tab_size + 1) * 2];
+ static const double A = -0.75;
+ for (int i = 0; i <= tab_size; ++i) {
+ float x = i * 1.0 / tab_size;
+ coeffs_tab[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
+ x += 1.0;
+ coeffs_tab[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+ }
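+ // For reference, the loop above tabulates the cubic convolution kernel
+ // W(x) = (A+2)|x|^3 - (A+3)|x|^2 + 1 for |x| <= 1
+ // W(x) = A|x|^3 - 5A|x|^2 + 8A|x| - 4A for 1 < |x| < 2
+ // with A = -0.75: coeffs_tab[i * 2] holds the |x| <= 1 branch and
+ // coeffs_tab[i * 2 + 1] the 1 < |x| < 2 branch, sampled at x = i / tab_size.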
+
+ auto cal = [](float v0, float v1, float v2, float v3, float dx) {
+ const int64 offset = round(dx * tab_size);
+ const float a0 = coeffs_tab[offset * 2 + 1];
+ const float a1 = coeffs_tab[offset * 2];
+ const float a2 = coeffs_tab[(tab_size - offset) * 2];
+ const float a3 = coeffs_tab[(tab_size - offset) * 2 + 1];
+ return a0 * v0 + a1 * v1 + a2 * v2 + a3 * v3;
+ };
+
+ float coeff[4] = {0.0};
+ for (int64 b = 0; b < batch_size; ++b) {
+ for (int64 y = 0; y < out_height; ++y) {
+ const int64 in_y = floor(height_scale * y);
+ const float dy = height_scale * y - in_y;
+ for (int64 x = 0; x < out_width; ++x) {
+ const int64 in_x = floor(width_scale * x);
+ const float dx = width_scale * x - in_x;
+ for (int64 c = 0; c < channels; ++c) {
+ for (int64 i = 0; i < 4; ++i) {
+#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
+ int64 bound_y = BOUND(in_y - 1 + i, in_height);
+ coeff[i] =
+ cal(input_data(b, bound_y, BOUND(in_x - 1, in_width), c),
+ input_data(b, bound_y, BOUND(in_x, in_width), c),
+ input_data(b, bound_y, BOUND(in_x + 1, in_width), c),
+ input_data(b, bound_y, BOUND(in_x + 2, in_width), c), dx);
+#undef BOUND
+ }
+ output_data(b, y, x, c) =
+ cal(coeff[0], coeff[1], coeff[2], coeff[3], dy);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeBicubic") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeBicubicOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
new file mode 100644
index 0000000000..5119b93508
--- /dev/null
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -0,0 +1,109 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeBilinearOp : public OpKernel {
+ public:
+ explicit ResizeBilinearOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ // Initialize shape to the batch size of the input, then add
+ // the rest of the dimensions
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
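+ // For example (informal): resizing 2 -> 3 along one dimension gives a
+ // scale of 2/3, so output row 1 samples source coordinate 2/3 and blends
+ // input row 0 with weight 1/3 and input row 1 with weight 2/3 (values 1
+ // and 3 interpolate to 7/3, as exercised in resize_bilinear_op_test.cc).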
+
+ for (int b = 0; b < batch_size; ++b) {
+ for (int y = 0; y < out_height; ++y) {
+ const float in_y = y * height_scale;
+ const int top_y_index = static_cast<int>(floorf(in_y));
+ const int bottom_y_index =
+ std::min(static_cast<int64>(ceilf(in_y)), (in_height - 1));
+ const float y_lerp = in_y - top_y_index;
+ const float inverse_y_lerp = (1.0f - y_lerp);
+ for (int x = 0; x < out_width; ++x) {
+ const float in_x = x * width_scale;
+ const int left_x_index = static_cast<int>(floorf(in_x));
+ const int right_x_index =
+ std::min(static_cast<int64>(ceilf(in_x)), (in_width - 1));
+ const float x_lerp = in_x - left_x_index;
+ const float inverse_x_lerp = (1.0f - x_lerp);
+ for (int c = 0; c < channels; ++c) {
+ const float top_left = input_data(b, top_y_index, left_x_index, c);
+ const float top_right =
+ input_data(b, top_y_index, right_x_index, c);
+ const float bottom_left =
+ input_data(b, bottom_y_index, left_x_index, c);
+ const float bottom_right =
+ input_data(b, bottom_y_index, right_x_index, c);
+ const float top =
+ (top_left * inverse_x_lerp) + (top_right * x_lerp);
+ const float bottom =
+ (bottom_left * inverse_x_lerp) + (bottom_right * x_lerp);
+ output_data(b, y, x, c) =
+ (top * inverse_y_lerp) + (bottom * y_lerp);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeBilinear") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeBilinearOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
new file mode 100644
index 0000000000..0ebe2e5f8c
--- /dev/null
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -0,0 +1,171 @@
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+class ResizeBilinearOpTest : public OpsTestBase {
+ protected:
+ ResizeBilinearOpTest() {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("resize_bilinear_op", "ResizeBilinear")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_INT32))
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To1x1) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+ ASSERT_OK(RunOpKernel());
+
+ // When scaling down, we have to arbitrarily pick a pixel from the
+ // original input. In this case, we choose the top-left pixel.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
+ test::FillValues<float>(&expected, {1.0});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+
+ // The corners should match the original corners, and we bilinear
+ // interpolate the values in between.
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 5.0/3, 2,
+ 7.0/3, 3, 10.0/3,
+ 3, 11.0/3, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear3x3To4x4) {
+ // Input:
+ // 1, 2, 3,
+ // 4, 5, 6,
+ // 7, 8, 9
+ AddInputFromArray<float>(TensorShape({1, 3, 3, 1}),
+ {1, 2, 3, 4, 5, 6, 7, 8, 9});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_OK(RunOpKernel());
+
+ // The corners should match the original corners, and we bilinear
+ // interpolate the values in between.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1.75, 2.5, 3,
+ 3.25, 4, 4.75, 5.25,
+ 5.5, 6.25, 7, 7.5,
+ 7, 7.75, 8.5, 9});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3Batch2) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ //
+ // repeated twice
+ AddInputFromArray<float>(TensorShape({2, 2, 2, 1}), {1, 2, 3, 4, 1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 1}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4,
+ 1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4
+ });
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2x2To3x3x2) {
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 2}),
+ {1, -1, 2, -2, 3, -3, 4, -4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 2}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {
+ 1, -1,
+ 5.0/3, -5.0/3,
+ 2, -2,
+ 7.0/3, -7.0/3,
+ 3, -3,
+ 10.0/3, -10.0/3,
+ 3, -3,
+ 11.0/3, -11.0/3,
+ 4, -4
+ });
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinear2x2To4x4) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1}));
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1.5, 2, 2,
+ 2, 2.5, 3, 3,
+ 3, 3.5, 4, 4,
+ 3, 3.5, 4, 4});
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestInvalidInputShape) {
+ AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_FALSE(RunOpKernel().ok());
+}
+
+TEST_F(ResizeBilinearOpTest, TestInvalidSizeDim) {
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2, 1}), {4, 4});
+ ASSERT_FALSE(RunOpKernel().ok());
+}
+
+TEST_F(ResizeBilinearOpTest, TestInvalidSizeElements) {
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({3}), {4, 4, 1});
+ ASSERT_FALSE(RunOpKernel().ok());
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
new file mode 100644
index 0000000000..13089308ce
--- /dev/null
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -0,0 +1,89 @@
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class ResizeNearestNeighborOp : public OpKernel {
+ public:
+ explicit ResizeNearestNeighborOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().ShortDebugString()));
+ const Tensor& shape_t = context->input(1);
+ OP_REQUIRES(context, shape_t.dims() == 1,
+ errors::InvalidArgument("shape_t must be 1-dimensional",
+ shape_t.shape().ShortDebugString()));
+ OP_REQUIRES(context, shape_t.NumElements() == 2,
+ errors::InvalidArgument("shape_t must have two elements",
+ shape_t.shape().ShortDebugString()));
+
+ auto Svec = shape_t.vec<int32>();
+ // Initialize shape to the batch size of the input, then add
+ // the rest of the dimensions
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({input.dim_size(0), Svec(0),
+ Svec(1), input.dim_size(3)}),
+ &output));
+
+ const int64 batch_size = input.dim_size(0);
+ const int64 in_height = input.dim_size(1);
+ const int64 in_width = input.dim_size(2);
+ const int64 channels = input.dim_size(3);
+ const int64 out_height = output->dim_size(1);
+ const int64 out_width = output->dim_size(2);
+
+ typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
+ typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
+
+ const float height_scale = in_height / static_cast<float>(out_height);
+ const float width_scale = in_width / static_cast<float>(out_width);
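+ // For example (informal): resizing width 2 -> 5 gives width_scale = 0.4,
+ // so output columns {0, 1, 2, 3, 4} read from input columns
+ // floor({0, 0.4, 0.8, 1.2, 1.6}) = {0, 0, 0, 1, 1}.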
+
+ for (int b = 0; b < batch_size; ++b) {
+ for (int y = 0; y < out_height; ++y) {
+ const int in_y = std::min(static_cast<int64>(floorf(y * height_scale)),
+ (in_height - 1));
+ for (int x = 0; x < out_width; ++x) {
+ const int in_x = std::min(static_cast<int64>(floorf(x * width_scale)),
+ (in_width - 1));
+ for (int c = 0; c < channels; ++c) {
+ output_data(b, y, x, c) = input_data(b, in_y, in_x, c);
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("size"), \
+ ResizeNearestNeighborOp<CPUDevice, T>);
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
new file mode 100644
index 0000000000..8fca1f34e3
--- /dev/null
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
@@ -0,0 +1,163 @@
+// TODO(shlens, sherrym): Consider adding additional tests in image_ops.py in
+// order to compare against the reference implementation for image resizing in
+// the Python Imaging Library.
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+class ResizeNearestNeighborOpTest : public OpsTestBase {
+ protected:
+ ResizeNearestNeighborOpTest() {
+ RequireDefaultOps();
+ EXPECT_OK(NodeDefBuilder("resize_nn", "ResizeNearestNeighbor")
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_INT32))
+ .Finalize(node_def()));
+ EXPECT_OK(InitOp());
+ }
+};
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To1x1) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected, {1});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To3x3) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 2,
+ 1, 1, 2,
+ 3, 3, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To2x5) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {2, 5});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 5, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 1, 2, 2,
+ 3, 3, 3, 4, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To5x2) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {5, 2});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 5, 2, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 2,
+ 1, 2,
+ 1, 2,
+ 3, 4,
+ 3, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To4x4) {
+ // Input:
+ // 1, 2
+ // 3, 4
+ AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 2, 2,
+ 1, 1, 2, 2,
+ 3, 3, 4, 4,
+ 3, 3, 4, 4});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2x2x2To2x3x3x2) {
+ // Input:
+ // [ [ 1, 1 ], [ 2, 2],
+ // [ 3, 3 ], [ 4, 4] ],
+ // [ [ 5, 5 ], [ 6, 6],
+ // [ 7, 7 ], [ 8, 8] ]
+ AddInputFromArray<float>(TensorShape({2, 2, 2, 2}),
+ {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8});
+ AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 2}));
+
+ // clang-format off
+ test::FillValues<float>(&expected,
+ {1, 1, 1,
+ 1, 2, 2,
+ 1, 1, 1,
+ 1, 2, 2,
+ 3, 3, 3,
+ 3, 4, 4,
+ 5, 5, 5,
+ 5, 6, 6,
+ 5, 5, 5,
+ 5, 6, 6,
+ 7, 7, 7,
+ 7, 8, 8});
+
+ // clang-format on
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/restore_op.cc b/tensorflow/core/kernels/restore_op.cc
new file mode 100644
index 0000000000..b52c69449c
--- /dev/null
+++ b/tensorflow/core/kernels/restore_op.cc
@@ -0,0 +1,65 @@
+// See docs in ../ops/io_ops.cc.
+#include "tensorflow/core/kernels/io.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/tensor_slice_reader.h"
+
+namespace tensorflow {
+
+class RestoreOp : public OpKernel {
+ public:
+ explicit RestoreOp(OpKernelConstruction* context) : OpKernel(context) {
+ int preferred_shard;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("preferred_shard", &preferred_shard));
+ if (preferred_shard == -1) {
+ preferred_shard_ = checkpoint::TensorSliceReader::kLoadAllShards;
+ } else {
+ OP_REQUIRES(context, preferred_shard >= 0,
+ errors::InvalidArgument("Attribute 'preferred_shard' must be "
+ "greater or equal to -1"));
+ preferred_shard_ = preferred_shard;
+ }
+ }
+ void Compute(OpKernelContext* context) override {
+ RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
+ preferred_shard_, false);
+ }
+
+ private:
+ int preferred_shard_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Restore").Device(DEVICE_CPU), RestoreOp);
+
+class RestoreSliceOp : public OpKernel {
+ public:
+ explicit RestoreSliceOp(OpKernelConstruction* context) : OpKernel(context) {
+ int preferred_shard;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("preferred_shard", &preferred_shard));
+ if (preferred_shard == -1) {
+ preferred_shard_ = checkpoint::TensorSliceReader::kLoadAllShards;
+ } else {
+ OP_REQUIRES(context, preferred_shard >= 0,
+ errors::InvalidArgument("Attribute 'preferred_shard' must be "
+ "greater or equal to -1"));
+ preferred_shard_ = preferred_shard;
+ }
+ }
+ void Compute(OpKernelContext* context) override {
+ RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
+ preferred_shard_, true);
+ }
+
+ private:
+ int preferred_shard_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RestoreSlice").Device(DEVICE_CPU),
+ RestoreSliceOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/restore_op_test.cc b/tensorflow/core/kernels/restore_op_test.cc
new file mode 100644
index 0000000000..59343a8037
--- /dev/null
+++ b/tensorflow/core/kernels/restore_op_test.cc
@@ -0,0 +1,305 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class RestoreOpTest : public OpsTestBase {
+ protected:
+ // Makes an operation to restore two tensors
+ void MakeRestoreOp(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Restore")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Attr("dt", dt)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(RestoreOpTest, RestoreInt) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_int");
+ const string tensor_name = "tensor_int";
+
+ // We first need to write a tensor using the save_op
+ {
+ // Initialize an operation
+ NodeDef save;
+ ASSERT_OK(NodeDefBuilder("save", "Save")
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput({DT_INT32}))
+ .Finalize(&save));
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
+ EXPECT_OK(status);
+
+ // Run it
+
+ // Input #0 is the file name
+ Tensor input_0(DT_STRING, TensorShape({}));
+ input_0.scalar<string>()() = filename;
+ inputs.push_back({nullptr, &input_0});
+
+ // Input #1 is the tensor name
+ Tensor input_1(DT_STRING, TensorShape({}));
+ input_1.scalar<string>()() = tensor_name;
+ inputs.push_back({nullptr, &input_1});
+
+ // Input #2 is an integer tensor: it's a 1-d array.
+ Tensor input_2(DT_INT32, TensorShape({10}));
+ for (int i = 0; i < 10; ++i) {
+ input_2.flat<int32>()(i) = i + 1;
+ }
+ inputs.push_back({nullptr, &input_2});
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ OpKernelContext ctx(params);
+ op->Compute(&ctx);
+ EXPECT_OK(ctx.status());
+ }
+
+ // Now we restore
+ MakeRestoreOp(DT_INT32);
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+ // Add the tensor names
+ AddInput<string>(TensorShape({}),
+ [&tensor_name](int x) -> string { return tensor_name; });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that we have an integer tensor
+ Tensor* output = GetOutput(0);
+ TensorShape expected({10});
+ EXPECT_TRUE(output->shape().IsSameSize(expected));
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_EQ(i + 1, output->flat<int32>()(i));
+ }
+}
+
+TEST_F(RestoreOpTest, RestoreFloat) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_float");
+ const string tensor_name = "tensor_float";
+
+ // We first need to write a tensor using the save_op
+ {
+ // Initialize an operation
+ NodeDef save;
+ ASSERT_OK(NodeDefBuilder("save", "Save")
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput({DT_FLOAT}))
+ .Finalize(&save));
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
+ EXPECT_OK(status);
+
+ // Run it
+
+ // Input #0 is the file name
+ Tensor input_0(DT_STRING, TensorShape({}));
+ input_0.scalar<string>()() = filename;
+ inputs.push_back({nullptr, &input_0});
+
+ // Input #1 is the tensor name
+ Tensor input_1(DT_STRING, TensorShape({}));
+ input_1.scalar<string>()() = tensor_name;
+ inputs.push_back({nullptr, &input_1});
+
+ // Input #2 is a float tensor: it's a 2-d array.
+ Tensor input_2(DT_FLOAT, TensorShape({2, 4}));
+ for (int i = 0; i < 8; ++i) {
+ input_2.flat<float>()(i) = static_cast<float>(i) / 10;
+ }
+ inputs.push_back({nullptr, &input_2});
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ OpKernelContext ctx(params);
+ op->Compute(&ctx);
+ EXPECT_OK(ctx.status());
+ }
+
+ // Now we restore
+ MakeRestoreOp(DT_FLOAT);
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+ // Add the tensor names
+ AddInput<string>(TensorShape({}),
+ [&tensor_name](int x) -> string { return tensor_name; });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that we have a float tensor.
+ Tensor* output = GetOutput(0);
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(output->shape().IsSameSize(expected));
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(static_cast<float>(i) / 10, output->flat<float>()(i));
+ }
+}
+
+class RestoreSliceOpTest : public OpsTestBase {
+ protected:
+ void MakeRestoreSliceOp(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "RestoreSlice")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Attr("dt", dt)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(RestoreSliceOpTest, RestoreInt) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_int");
+ const string tensor_name = "tensor_int";
+
+ // We first need to write a tensor using the save_op
+ {
+ // Initialize an operation
+ NodeDef save;
+ ASSERT_OK(NodeDefBuilder("save", "Save")
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput(DT_STRING))
+ .Input(FakeInput({DT_INT32}))
+ .Finalize(&save));
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
+ EXPECT_OK(status);
+
+ // Run it
+
+ // Input #0 is the file name
+ Tensor input_0(DT_STRING, TensorShape({}));
+ input_0.scalar<string>()() = filename;
+ inputs.push_back({nullptr, &input_0});
+
+ // Input #1 is the tensor name
+ Tensor input_1(DT_STRING, TensorShape({}));
+ input_1.scalar<string>()() = tensor_name;
+ inputs.push_back({nullptr, &input_1});
+
+ // Input #2 is a 4x16 integer tensor.
+ Tensor input_2(DT_INT32, TensorShape({4, 16}));
+ for (int64 i = 0; i < input_2.NumElements(); ++i) {
+ input_2.flat<int32>()(i) = i + 1;
+ }
+ inputs.push_back({nullptr, &input_2});
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+ checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+ params.slice_reader_cache = &slice_reader_cache_wrapper;
+
+ OpKernelContext ctx(params);
+ op->Compute(&ctx);
+ EXPECT_OK(ctx.status());
+ }
+
+ // Now we restore
+ MakeRestoreSliceOp(DT_INT32);
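+ // The shape-and-slice spec below says: the saved tensor has full shape
+ // [4, 16]; take the slice "0,2" (start 0, length 2) of dimension 0 and
+ // all ("-") of dimension 1, so a [2, 16] tensor is expected back.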
+ string shape_and_slice = "4 16 0,2:-";
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+ // Add the tensor names
+ AddInput<string>(TensorShape({}),
+ [&tensor_name](int x) -> string { return tensor_name; });
+ // Add the tensor shape and slice
+ AddInput<string>(TensorShape({}), [&shape_and_slice](int x) -> string {
+ return shape_and_slice;
+ });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that we have an integer tensor
+ Tensor* output = GetOutput(0);
+ TensorShape expected({2, 16});
+ EXPECT_TRUE(output->shape().IsSameSize(expected));
+ for (int64 i = 0; i < expected.num_elements(); ++i) {
+ EXPECT_EQ(i + 1, output->flat<int32>()(i));
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
new file mode 100644
index 0000000000..c63dfc1e70
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -0,0 +1,139 @@
+// See docs in ../ops/array_ops.cc
+#define EIGEN_USE_THREADS
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/reverse_op.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class ReverseOp : public OpKernel {
+ public:
+ explicit ReverseOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& dims = context->input(1);
+
+ if (TensorShapeUtils::IsScalar(input.shape())) {
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ output->scalar<T>() = input.scalar<T>();
+
+ } else {
+ const int input_dims = input.dims();
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(dims.shape()),
+ errors::InvalidArgument("'dims' must be 1-dimension, not ",
+ dims.dims()));
+
+ OP_REQUIRES(context, input_dims == dims.dim_size(0),
+ errors::InvalidArgument(
+ "'dims' must have the same number of values as 'input' has "
+ "dimensions. 'input' has ", input_dims, "'dims' has ",
+ dims.dim_size(0), " values"));
+ OP_REQUIRES(context, input_dims <= 8, errors::Unimplemented(
+ "reverse is not implemented for tensors of rank > 8."));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+#define HANDLE_REVERSE(NDIMS) \
+ case NDIMS: \
+ functor::Reverse<Device, T, NDIMS>()( \
+ context->eigen_device<Device>(), input.tensor<T, NDIMS>(), \
+ dims.vec<bool>(), output->tensor<T, NDIMS>()); \
+ return;
+
+ switch (input_dims) {
+ HANDLE_REVERSE(0);
+ HANDLE_REVERSE(1);
+ HANDLE_REVERSE(2);
+ HANDLE_REVERSE(3);
+ HANDLE_REVERSE(4);
+ HANDLE_REVERSE(5);
+ HANDLE_REVERSE(6);
+ HANDLE_REVERSE(7);
+ HANDLE_REVERSE(8);
+ }
+#undef HANDLE_REVERSE
+ }
+ }
+};
+
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("Reverse") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("dims"), \
+ ReverseOp<CPUDevice, T>)
+
+REGISTER_KERNEL(uint8);
+REGISTER_KERNEL(int8);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(bool);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the function specializations for GPU (to prevent
+// building the GPU versions here, they will be built compiling _gpu.cu.cc).
+namespace functor {
+#define DECLARE_GPU_SPEC_DIM(T, DIM) \
+ template <> \
+ void Reverse<GPUDevice, T, DIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, DIM>::ConstTensor input, \
+ typename TTypes<bool, 1>::ConstTensor dims, \
+ typename TTypes<T, DIM>::Tensor output); \
+ extern template struct Reverse<GPUDevice, T, DIM>;
+#define DECLARE_GPU_SPEC(T) \
+ DECLARE_GPU_SPEC_DIM(T, 0) \
+ DECLARE_GPU_SPEC_DIM(T, 1) \
+ DECLARE_GPU_SPEC_DIM(T, 2) \
+ DECLARE_GPU_SPEC_DIM(T, 3) \
+ DECLARE_GPU_SPEC_DIM(T, 4) \
+ DECLARE_GPU_SPEC_DIM(T, 5) \
+ DECLARE_GPU_SPEC_DIM(T, 6) \
+ DECLARE_GPU_SPEC_DIM(T, 7) \
+ DECLARE_GPU_SPEC_DIM(T, 8)
+
+DECLARE_GPU_SPEC(uint8);
+DECLARE_GPU_SPEC(int8);
+DECLARE_GPU_SPEC(int32);
+DECLARE_GPU_SPEC(bool);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+#undef DECLARE_GPU_SPEC_DIM
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("Reverse") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("dims"), \
+ ReverseOp<GPUDevice, T>)
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+#undef REGISTER_GPU_KERNEL
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_op.h b/tensorflow/core/kernels/reverse_op.h
new file mode 100644
index 0000000000..bba25f70e8
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op.h
@@ -0,0 +1,28 @@
+#ifndef TENSORFLOW_KERNELS_REVERSE_OP_H_
+#define TENSORFLOW_KERNELS_REVERSE_OP_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by ReverseOp to do the computations.
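+// For example (informal): for a 2-D input with dims = {true, false}, the
+// row order is reversed and the columns are untouched, so
+// [[1, 2, 3], [4, 5, 6]] becomes [[4, 5, 6], [1, 2, 3]].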
+template <typename Device, typename T, int Dims>
+struct Reverse {
+ void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ typename TTypes<bool, 1>::ConstTensor dims,
+ typename TTypes<T, Dims>::Tensor output) {
+ // 'dims' is in host memory, so it can be read directly here.
+ Eigen::array<bool, Dims> reverse_dims;
+ for (int i = 0; i < Dims; ++i) {
+ reverse_dims[i] = dims(i);
+ }
+ output.device(d) = input.reverse(reverse_dims);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REVERSE_OP_H_
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
new file mode 100644
index 0000000000..b510add3f3
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -0,0 +1,33 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/reverse_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_REVERSE(DIM) \
+ template struct functor::Reverse<GPUDevice, uint8, DIM>; \
+ template struct functor::Reverse<GPUDevice, int8, DIM>; \
+ template struct functor::Reverse<GPUDevice, int32, DIM>; \
+ template struct functor::Reverse<GPUDevice, bool, DIM>; \
+ template struct functor::Reverse<GPUDevice, float, DIM>; \
+ template struct functor::Reverse<GPUDevice, double, DIM>;
+DEFINE_REVERSE(0)
+DEFINE_REVERSE(1)
+DEFINE_REVERSE(2)
+DEFINE_REVERSE(3)
+DEFINE_REVERSE(4)
+DEFINE_REVERSE(5)
+DEFINE_REVERSE(6)
+DEFINE_REVERSE(7)
+DEFINE_REVERSE(8)
+#undef DEFINE_REVERSE
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
new file mode 100644
index 0000000000..d41c36e693
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -0,0 +1,101 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class ReverseOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType data_type) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Reverse")
+ .Input(FakeInput(data_type))
+ .Input(FakeInput())
+ .Attr("T", data_type)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(ReverseOpTest, Reverse_0) {
+ MakeOp(DT_FLOAT);
+ AddInputFromArray<float>(TensorShape({}), {3});
+ AddInputFromArray<bool>(TensorShape({}), {true});
+ ASSERT_OK(RunOpKernel());
+
+ Tensor* output = GetOutput(0);
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({}));
+ expected.scalar<float>() = expected.scalar<float>().constant(3.f);
+ test::ExpectTensorEqual<float>(expected, *output);
+}
+
+TEST_F(ReverseOpTest, Reverse_234) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ // [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
+ // [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]
+ AddInputFromArray<float>(TensorShape({2, 3, 4}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23});
+ AddInputFromArray<bool>(TensorShape({3}), {true, false, true});
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output
+ Tensor* output = GetOutput(0);
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+ // Should become
+ // [[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
+ // [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]
+ test::FillValues<float>(
+ &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7,
+ 6, 5, 4, 11, 10, 9, 8});
+ test::ExpectTensorEqual<float>(expected, *output);
+}
+
+TEST_F(ReverseOpTest, Reverse_1234) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ // [[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
+ // [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]]
+ AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23});
+ AddInputFromArray<bool>(TensorShape({4}), {true, true, false, true});
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output
+ Tensor* output = GetOutput(0);
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4}));
+ // Should become
+ // [[[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
+ // [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]]
+ test::FillValues<float>(
+ &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7,
+ 6, 5, 4, 11, 10, 9, 8});
+ test::ExpectTensorEqual<float>(expected, *output);
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
new file mode 100644
index 0000000000..6673a700ef
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -0,0 +1,170 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/reverse_sequence_op.h"
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device>
+void CheckErrors(OpKernelContext* context, int seq_dim) {
+ const Tensor& input = context->input(0);
+ const Tensor& seq_lens = context->input(1);
+
+ auto seq_lens_t = seq_lens.vec<int64>();
+
+ std::vector<int64> seq_lens_vec(seq_lens_t.size());
+
+ // Copy seq_len info down for validity checks
+ context->eigen_device<Device>().memcpyDeviceToHost(
+ seq_lens_vec.data(), seq_lens_t.data(),
+ sizeof(int64) * seq_lens_t.size());
+
+ OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim"));
+ OP_REQUIRES(context, seq_dim < input.dims(),
+ errors::InvalidArgument("seq_dim must be < input.dims()", "( ",
+ seq_dim, " vs. ", input.dims(), ")"));
+
+ OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0),
+ errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ",
+ "(", seq_lens.NumElements(), " vs. ",
+ input.dim_size(0), ")"));
+
+ for (int d = 0; d < seq_lens_vec.size(); ++d) {
+ OP_REQUIRES(context, seq_lens_vec[d] >= 0,
+ errors::InvalidArgument("seq_lens(", d, ") < 0"));
+ OP_REQUIRES(context, seq_lens_vec[d] <= input.dim_size(seq_dim),
+ errors::InvalidArgument("seq_lens(", d, ") > input.dims(",
+ seq_dim, ")"));
+ }
+}
+
+template <>
+void CheckErrors<GPUDevice>(OpKernelContext* context, int seq_dim) {
+ const Tensor& input = context->input(0);
+ const Tensor& seq_lens = context->input(1);
+
+ OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim"));
+ OP_REQUIRES(context, seq_dim < input.dims(),
+ errors::InvalidArgument("seq_dim must be < input.dims()", "( ",
+ seq_dim, " vs. ", input.dims(), ")"));
+
+ OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0),
+ errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ",
+ "(", seq_lens.NumElements(), " vs. ",
+ input.dim_size(0), ")"));
+}
+
+template <typename Device, typename T>
+class ReverseSequenceOp : public OpKernel {
+ public:
+ explicit ReverseSequenceOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& seq_lens = context->input(1);
+
+ // Preliminary validation of sizes.
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens.shape()),
+ errors::InvalidArgument("seq_lens input must be 1-dim, not ",
+ seq_lens.dims()));
+
+ auto seq_lens_t = seq_lens.vec<int64>();
+
+ CheckErrors<Device>(context, seq_dim_);
+
+ const int input_dims = input.dims();
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+
+#define HANDLE_DIM(NDIM) \
+ case NDIM: \
+ functor::ReverseSequence<Device, T, NDIM>::Compute( \
+ context->eigen_device<Device>(), input.tensor<T, NDIM>(), seq_dim_, \
+ seq_lens_t, output->tensor<T, NDIM>()); \
+ break;
+
+ switch (input_dims) {
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "ReverseSequenceOp : Unhandled input dimensions: ",
+ input_dims));
+ }
+ }
+
+ private:
+ int32 seq_dim_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
+};
+
+#define REGISTER_REVERSE_SEQUENCE(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ ReverseSequenceOp<CPUDevice, type>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE);
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, Dims) \
+ template <> \
+ void ReverseSequence<GPUDevice, T, Dims>::Compute( \
+ const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+ int32 seq_dim, TTypes<int64>::ConstVec seq_lens, \
+ typename TTypes<T, Dims>::Tensor output); \
+ extern template struct ReverseSequence<GPUDevice, T, Dims>;
+
+#define DECLARE_GPU_SPECS(T) \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_REVERSE_SEQUENCE_GPU(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ ReverseSequenceOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU);
+
+#undef REGISTER_REVERSE_SEQUENCE_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_sequence_op.h b/tensorflow/core/kernels/reverse_sequence_op.h
new file mode 100644
index 0000000000..d1dd572dcb
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_sequence_op.h
@@ -0,0 +1,56 @@
+#ifndef TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
+#define TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
+// Generator definition for ReverseSequenceOp, must be compilable by nvcc.
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace generator {
+
+template <typename T, size_t Dims>
+class ReverseGenerator {
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ ReverseGenerator(typename TTypes<T, Dims>::ConstTensor input, int32 seq_dim,
+ TTypes<int64>::ConstVec seq_lengths)
+ : input_(input), seq_dim_(seq_dim), seq_lengths_(seq_lengths) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
+ operator()(const Eigen::array<Eigen::DenseIndex, Dims>& coords) const {
+ Eigen::array<Eigen::DenseIndex, Dims> new_coords = coords;
+ if (coords[seq_dim_] < seq_lengths_(coords[0])) {
+ new_coords[seq_dim_] = seq_lengths_(coords[0]) - coords[seq_dim_] - 1;
+ }
+
+ return input_(new_coords);
+ }
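+
+ // Worked example (illustrative only): with seq_dim_ == 1 and
+ // seq_lengths_(0) == 3, batch-0 coordinates map as (0, 0) -> (0, 2),
+ // (0, 1) -> (0, 1) and (0, 2) -> (0, 0), while coordinates at or beyond
+ // the sequence length, e.g. (0, 3), pass through unchanged.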
+
+ private:
+ typename TTypes<T, Dims>::ConstTensor input_;
+ int32 seq_dim_;
+ TTypes<int64>::ConstVec seq_lengths_;
+};
+
+} // namespace generator
+
+namespace functor {
+
+template <typename Device, typename T, size_t Dims>
+struct ReverseSequence {
+ EIGEN_ALWAYS_INLINE static void Compute(
+ const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+ int32 seq_dim, TTypes<int64>::ConstVec seq_lengths,
+ typename TTypes<T, Dims>::Tensor output) {
+ generator::ReverseGenerator<T, Dims> generator(input, seq_dim, seq_lengths);
+ output.device(d) = input.generate(generator);
+ }
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
new file mode 100644
index 0000000000..7b5d533026
--- /dev/null
+++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
@@ -0,0 +1,26 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/reverse_sequence_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_SPEC(T, dims) \
+ template class generator::ReverseGenerator<T, dims>; \
+ template struct functor::ReverseSequence<GPUDevice, T, dims>;
+
+#define DEFINE_GPU_SPECS(T) \
+ DEFINE_GPU_SPEC(T, 2); \
+ DEFINE_GPU_SPEC(T, 3); \
+ DEFINE_GPU_SPEC(T, 4); \
+ DEFINE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/save_op.cc b/tensorflow/core/kernels/save_op.cc
new file mode 100644
index 0000000000..71a15c643e
--- /dev/null
+++ b/tensorflow/core/kernels/save_op.cc
@@ -0,0 +1,81 @@
+// See docs in ../ops/io_ops.cc
+#include "tensorflow/core/kernels/io.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/util/tensor_slice_writer.h"
+
+namespace tensorflow {
+
+class SaveOp : public OpKernel {
+ public:
+ explicit SaveOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ SaveTensors(context, &checkpoint::CreateTableTensorSliceBuilder, false);
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Save").Device(DEVICE_CPU), SaveOp);
+
+class SaveSlicesOp : public OpKernel {
+ public:
+ explicit SaveSlicesOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ SaveTensors(context, &checkpoint::CreateTableTensorSliceBuilder, true);
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("SaveSlices").Device(DEVICE_CPU), SaveSlicesOp);
+
+class ShardedFilenameOp : public OpKernel {
+ public:
+ explicit ShardedFilenameOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ static const char* input_names[3] = {"basename", "shard", "num_shards"};
+ for (int i = 0; i < ctx->num_inputs(); ++i) {
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(ctx->input(i).shape()),
+ errors::InvalidArgument(
+ input_names[i], " must be a scalar, got shape ",
+ ctx->input(i).shape().ShortDebugString()));
+ }
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
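+ // For example (illustrative only): basename "foo", shard 1, num_shards 8
+ // yields "foo-00001-of-00008".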
+ out->scalar<string>()() = strings::Printf(
+ "%s-%05d-of-%05d", ctx->input(0).scalar<string>()().c_str(),
+ ctx->input(1).scalar<int32>()(), ctx->input(2).scalar<int32>()());
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ShardedFilename").Device(DEVICE_CPU),
+ ShardedFilenameOp);
+
+class ShardedFilespecOp : public OpKernel {
+ public:
+ explicit ShardedFilespecOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ static const char* input_names[2] = {"basename", "num_shards"};
+ for (int i = 0; i < ctx->num_inputs(); ++i) {
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(ctx->input(i).shape()),
+ errors::InvalidArgument(
+ input_names[i], " must be a scalar, got shape ",
+ ctx->input(i).shape().ShortDebugString()));
+ }
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+ out->scalar<string>()() = strings::Printf(
+ "%s-\?\?\?\?\?-of-%05d", ctx->input(0).scalar<string>()().c_str(),
+ ctx->input(1).scalar<int32>()());
+ }
+};
+REGISTER_KERNEL_BUILDER(Name("ShardedFilespec").Device(DEVICE_CPU),
+ ShardedFilespecOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/save_op_test.cc b/tensorflow/core/kernels/save_op_test.cc
new file mode 100644
index 0000000000..ee1ba492a6
--- /dev/null
+++ b/tensorflow/core/kernels/save_op_test.cc
@@ -0,0 +1,443 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/tensor_slice_reader.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class SaveOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "Save")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput(
+ {DT_INT32, DT_FLOAT, DT_DOUBLE, DT_QINT8, DT_QINT32}))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SaveOpTest, Simple) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_simple");
+ const string tensornames[] = {"tensor_int", "tensor_float", "tensor_double",
+ "tensor_qint8", "tensor_qint32"};
+
+ MakeOp();
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+
+ // Add the tensor names
+ AddInput<string>(TensorShape({5}),
+ [&tensornames](int x) -> string { return tensornames[x]; });
+
+ // Add a 1-d integer tensor
+ AddInput<int32>(TensorShape({10}), [](int x) -> int32 { return x + 1; });
+
+ // Add a 2-d float tensor
+ AddInput<float>(TensorShape({2, 4}),
+ [](int x) -> float { return static_cast<float>(x) / 10; });
+
+ // Add a 2-d double tensor
+ AddInput<double>(TensorShape({2, 4}),
+ [](int x) -> double { return static_cast<double>(x) / 20; });
+
+ // Add a 2-d qint8 tensor
+ AddInput<qint8>(TensorShape({3, 2}),
+ [](int x) -> qint8 { return *reinterpret_cast<qint8*>(&x); });
+
+ // Add a 2-d qint32 tensor
+ AddInput<qint32>(TensorShape({2, 3}), [](int x) -> qint32 {
+ return *reinterpret_cast<qint32*>(&x) * qint8(2);
+ });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that the checkpoint file is properly written
+ checkpoint::TensorSliceReader reader(filename,
+ checkpoint::OpenTableTensorSliceReader);
+ EXPECT_OK(reader.status());
+
+ // We expect to find all saved tensors
+ {
+ // The 1-d integer tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_int", &shape, &type));
+ TensorShape expected({10});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_INT32, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-");
+ int data[10];
+ std::fill_n(data, 10, 0);
+ EXPECT_TRUE(reader.CopySliceData("tensor_int", s, data));
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_EQ(i + 1, data[i]);
+ }
+ }
+
+ {
+ // The 2-d float tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_float", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_FLOAT, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ float data[8];
+ std::fill_n(data, 8, 0);
+ EXPECT_TRUE(reader.CopySliceData("tensor_float", s, data));
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(static_cast<float>(i) / 10, data[i]);
+ }
+ }
+
+ {
+ // The 2-d double tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_double", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_DOUBLE, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ double data[8];
+ std::fill_n(data, 8, 0);
+ EXPECT_TRUE(reader.CopySliceData("tensor_double", s, data));
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(static_cast<double>(i) / 20, data[i]);
+ }
+ }
+
+ {
+ // The 2-d qint8 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint8", &shape, &type));
+ TensorShape expected({3, 2});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT8, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ qint8 data[6];
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint8", s, data));
+ for (int i = 0; i < 6; ++i) {
+ EXPECT_EQ(*reinterpret_cast<qint8*>(&i), data[i]);
+ }
+ }
+
+ {
+ // The 2-d qint32 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint32", &shape, &type));
+ TensorShape expected({2, 3});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT32, type);
+
+ // We expect the tensor value to be correct.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ qint32 data[6];
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint32", s, data));
+ for (int i = 0; i < 6; ++i) {
+ EXPECT_EQ(*reinterpret_cast<qint32*>(&i) * qint8(2), data[i]);
+ }
+ }
+}
+
+class SaveSlicesOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "SaveSlices")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput(
+ {DT_INT32, DT_FLOAT, DT_DOUBLE, DT_QINT8, DT_QINT32}))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+// Here we save only slices. We restore them into a larger tensor and check
+// that the right slice is restored. It is quite tricky to verify that the
+// right slices are actually restored, so instead we just check that
+// CopySliceData() returns true/false depending on the slice we ask for.
+TEST_F(SaveSlicesOpTest, Slices) {
+ const string filename = io::JoinPath(testing::TmpDir(), "tensor_slices");
+ const string tensornames[] = {"tensor_int", "tensor_float", "tensor_double",
+ "tensor_qint8", "tensor_qint32"};
+ // Specifies that the data we save are slices of larger tensors.
+ // See core/framework/tensor_slice.h for the slice syntax.
+ const string tensorshapes[] = {
+ "10 -", // Full contents of a 10 element vector.
+ "2 4 -:0,2", // A 2x2 slice of a 2x4 tensor.
+ "2 4 0,1:2,2", // A 1x2 slice of a 2x4 tensor.
+ "3 2 -:-", // Full contents of a 3x2 tensor.
+ "2 3 1,1:2,1" // Another 1x1 slice of a2x3 tensor.
+ };
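+
+ // Reading the specs above (illustrative note): the leading numbers give the
+ // full tensor shape, and each colon-separated field is either "-" (the whole
+ // dimension) or "start,length". For instance "2 4 -:0,2" describes a 2x4
+ // tensor from which all rows and columns 0..1 are saved, i.e. a 2x2 slice.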
+
+ MakeOp();
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+
+ // Add the tensor names
+ AddInput<string>(TensorShape({5}),
+ [&tensornames](int x) -> string { return tensornames[x]; });
+
+ // Add the tensor shapes and slices
+ AddInput<string>(TensorShape({5}), [&tensorshapes](int x) -> string {
+ return tensorshapes[x];
+ });
+
+ // Add a 1-d integer tensor
+ AddInput<int32>(TensorShape({10}), [](int x) -> int32 { return x + 1; });
+
+ // Add a 2-d float tensor
+ AddInput<float>(TensorShape({2, 2}),
+ [](int x) -> float { return static_cast<float>(x) / 10; });
+
+ // Add a 2-d double tensor
+ AddInput<double>(TensorShape({1, 2}),
+ [](int x) -> double { return static_cast<double>(x) / 20; });
+
+ // Add a 2-d qint8 tensor
+ AddInput<qint8>(TensorShape({3, 2}),
+ [](int x) -> qint8 { return *reinterpret_cast<qint8*>(&x); });
+
+ // Add a 2-d qint32 tensor
+ AddInput<qint32>(TensorShape({1, 1}), [](int x) -> qint32 {
+ return *reinterpret_cast<qint32*>(&x) * qint8(2);
+ });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that the checkpoint file is properly written
+ checkpoint::TensorSliceReader reader(filename,
+ checkpoint::OpenTableTensorSliceReader);
+ EXPECT_OK(reader.status());
+
+ // We expect to find all saved tensors
+ {
+ // The 1-d integer tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_int", &shape, &type));
+ TensorShape expected({10});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_INT32, type);
+
+ // We saved the full tensor so we should be able to read it all.
+ TensorSlice s = TensorSlice::ParseOrDie("-");
+ int data[10];
+ EXPECT_TRUE(reader.CopySliceData("tensor_int", s, data));
+ }
+
+ {
+ // The 2-d float tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_float", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_FLOAT, type);
+
+ // We saved the slice "-:0,2" so we should not be able to read the full
+ // tensor.
+ TensorSlice full_slice = TensorSlice::ParseOrDie("-:-");
+ TensorSlice saved_slice = TensorSlice::ParseOrDie("-:0,2");
+ float data[8];
+ EXPECT_FALSE(reader.CopySliceData("tensor_float", full_slice, data));
+ EXPECT_TRUE(reader.CopySliceData("tensor_float", saved_slice, data));
+ }
+
+ {
+ // The 2-d double tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_double", &shape, &type));
+ TensorShape expected({2, 4});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_DOUBLE, type);
+
+ // We saved the slice "0,1:2,2" so we should not be able to read the full
+ // tensor.
+ TensorSlice full_slice = TensorSlice::ParseOrDie("-:-");
+ TensorSlice saved_slice = TensorSlice::ParseOrDie("0,1:2,2");
+ double data[8];
+ EXPECT_FALSE(reader.CopySliceData("tensor_double", full_slice, data));
+ EXPECT_TRUE(reader.CopySliceData("tensor_double", saved_slice, data));
+ }
+
+ {
+ // The 2-d qint8 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint8", &shape, &type));
+ TensorShape expected({3, 2});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT8, type);
+
+ // We saved the full slice.
+ TensorSlice s = TensorSlice::ParseOrDie("-:-");
+ qint8 data[6];
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint8", s, data));
+ }
+
+ {
+ // The 2-d qint32 tensor
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("tensor_qint32", &shape, &type));
+ TensorShape expected({2, 3});
+ EXPECT_TRUE(shape.IsSameSize(expected));
+ EXPECT_EQ(DT_QINT32, type);
+
+ // We saved the slice "1,1:2,1" so we should not be able to read the full
+ // tensor.
+ TensorSlice full_slice = TensorSlice::ParseOrDie("-:-");
+ TensorSlice saved_slice = TensorSlice::ParseOrDie("1,1:2,1");
+ qint32 data[6];
+ EXPECT_FALSE(reader.CopySliceData("tensor_qint32", full_slice, data));
+ EXPECT_TRUE(reader.CopySliceData("tensor_qint32", saved_slice, data));
+ }
+}
+
+class SaveOpSlices2Test : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "SaveSlices")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Input(FakeInput({DT_INT32, DT_INT32, DT_FLOAT}))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SaveOpSlices2Test, TwoSlices) {
+ const string filename = io::JoinPath(testing::TmpDir(), "three_slices");
+ // We will save 2 slices of the tensor named "four_by_sixteen", which is
+ // 4x16, and the full "small" tensor.
+ const string tensornames[] = {"four_by_sixteen", "four_by_sixteen", "small"};
+ const string tensorshapes[] = {
+ // Slice specifications for the 2 slices of "four_by_sixteen"
+ "4 16 0,2:-", // 1st slice covers indices 0 and 1 in the first dim.
+ "4 16 2,2:-", // 2nd slice covers indices 2 and 3 in the first dim.
+ "" // We save the full "small" tensors.
+ };
+
+ MakeOp();
+ // Add a file name
+ AddInput<string>(TensorShape({}),
+ [&filename](int x) -> string { return filename; });
+
+ // Add the tensor names
+ AddInput<string>(TensorShape({3}),
+ [&tensornames](int x) -> string { return tensornames[x]; });
+
+ // Add the tensor shapes and slices
+ AddInput<string>(TensorShape({3}), [&tensorshapes](int x) -> string {
+ return tensorshapes[x];
+ });
+
+ // Add an integer tensor for slice 0,2:- of a 4x16 tensor: It is 2x16.
+ AddInput<int32>(TensorShape({2, 16}), [](int x) -> int32 { return x + 1; });
+
+ // Add an integer tensor for slice 2,2:- of a 4x16 tensor: It is 2x16.
+ AddInput<int32>(TensorShape({2, 16}),
+ [](int x) -> int32 { return 10 * (x + 1); });
+
+ // Add a float tensor for "small"
+ AddInput<float>(TensorShape({2, 4}),
+ [](int x) -> float { return static_cast<float>(x) / 10; });
+
+ ASSERT_OK(RunOpKernel());
+
+ // Check that the checkpoint file is properly written
+ checkpoint::TensorSliceReader reader(filename,
+ checkpoint::OpenTableTensorSliceReader);
+ EXPECT_OK(reader.status());
+
+ {
+ // Reload the two slices of "four_by_sixteen" into that tensor.
+ Tensor reloaded(DT_INT32, {4, 16});
+
+ // We expect to find all slices
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("four_by_sixteen", &shape, &type));
+ EXPECT_TRUE(shape.IsSameSize(reloaded.shape()));
+ EXPECT_EQ(type, reloaded.dtype());
+
+ // Reload the whole tensor.
+ EXPECT_TRUE(reader.CopySliceData("four_by_sixteen",
+ TensorSlice(reloaded.dims()),
+ reloaded.flat<int>().data()));
+
+ {
+ auto slice = reloaded.Slice(0, 2).flat<int>();
+ for (int i = 0; i < slice.size(); ++i) {
+ EXPECT_EQ(i + 1, slice(i));
+ }
+ }
+ {
+ auto slice = reloaded.Slice(2, 4).flat<int>();
+ for (int i = 0; i < slice.size(); ++i) {
+ EXPECT_EQ(10 * (i + 1), slice(i));
+ }
+ }
+ }
+
+ {
+ // Reload the small float tensor.
+ Tensor reloaded(DT_FLOAT, {2, 4});
+
+ TensorShape shape;
+ DataType type;
+ EXPECT_TRUE(reader.HasTensor("small", &shape, &type));
+ EXPECT_TRUE(shape.IsSameSize(reloaded.shape()));
+ EXPECT_EQ(DT_FLOAT, reloaded.dtype());
+
+ EXPECT_TRUE(reader.CopySliceData("small", TensorSlice(reloaded.dims()),
+ reloaded.flat<float>().data()));
+
+ for (int64 i = 0; i < reloaded.NumElements(); ++i) {
+ EXPECT_EQ(static_cast<float>(i) / 10, reloaded.flat<float>().data()[i]);
+ }
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
new file mode 100644
index 0000000000..88fcc1bdcc
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -0,0 +1,167 @@
+// See docs in ../ops/state_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+enum class UpdateOp { ASSIGN, ADD, SUB };
+
+template <class T, typename Index, UpdateOp op>
+class ScatterUpdateOp : public OpKernel {
+ public:
+ // QUESTION: It'd be nice to support DT_INT16, DT_UINT8,
+ // etc. here. Should we have the framework do some sort of
+ // integer promotion automatically, or should that be something
+ // that users have to do explicitly with a conversion operator
+ // in the graph?
+ explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) {
+ OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ if (use_exclusive_lock_) {
+ // Hold mutex while we apply updates
+ mutex_lock l(*c->input_ref_mutex(0));
+ DoCompute(c);
+ } else {
+ DoCompute(c);
+ }
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ // Check whether updates.shape = indices.shape + params.shape[1:]
+ static bool ValidShapes(const Tensor& params, const Tensor& updates,
+ const Tensor& indices) {
+ if (updates.dims() != indices.dims() + params.dims() - 1) return false;
+ for (int d = 0; d < indices.dims(); d++) {
+ if (updates.dim_size(d) != indices.dim_size(d)) {
+ return false;
+ }
+ }
+ for (int d = 1; d < params.dims(); d++) {
+ if (params.dim_size(d) != updates.dim_size(d - 1 + indices.dims())) {
+ return false;
+ }
+ }
+ return true;
+ }
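+
+ // For example (illustrative only): params of shape [5, 3] and indices of
+ // shape [2] require updates of shape [2, 3]; indices of shape [2, 3] would
+ // require updates of shape [2, 3, 3].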
+
+ void DoCompute(OpKernelContext* c) {
+ Tensor Tparams = c->mutable_input(0, use_exclusive_lock_);
+ OP_REQUIRES(c, Tparams.IsInitialized(),
+ errors::FailedPrecondition("Null ref for params"));
+ const Tensor& Tindices = c->input(1);
+ const Tensor& Tupdates = c->input(2);
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsVectorOrHigher(Tparams.shape()),
+ errors::InvalidArgument("params must be at least 1-D, got shape ",
+ Tparams.shape().ShortDebugString()));
+ OP_REQUIRES(
+ c, ValidShapes(Tparams, Tupdates, Tindices),
+ errors::InvalidArgument(
+ "Must have updates.shape = indices.shape + params.shape[1:], got ",
+ "updates.shape ", Tupdates.shape().ShortDebugString(),
+ ", indices.shape ", Tindices.shape().ShortDebugString(),
+ ", params.shape ", Tparams.shape().ShortDebugString()));
+ const Index N = Tindices.NumElements();
+
+ // We always return the input ref.
+ c->forward_ref_input_to_ref_output(0, 0);
+
+ if (N > 0) {
+ const Index first_dim_size = Tparams.dim_size(0);
+ // Validate all the indices are in range
+ auto Tindices_vec = Tindices.flat<Index>();
+ for (Index i = 0; i < N; i++) {
+ const Index index = Tindices_vec(i);
+ OP_REQUIRES(c, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in indices is out of range")));
+ }
+ auto Tparams_flat = Tparams.flat_outer_dims<T>();
+ auto Tupdates_flat =
+ Tupdates.shaped<T, 2>({N, Tupdates.NumElements() / N});
+ for (Index i = 0; i < N; i++) {
+ // Copy last Ndim-1 dimensions of Tupdates[i] to
+ // Tparams[Tindices[i]]
+ switch (op) {
+ case UpdateOp::ASSIGN: {
+ Tparams_flat.template chip<0>(Tindices_vec(i)) =
+ Tupdates_flat.template chip<0>(i);
+ break;
+ }
+ case UpdateOp::ADD: {
+ Tparams_flat.template chip<0>(Tindices_vec(i)) +=
+ Tupdates_flat.template chip<0>(i);
+ break;
+ }
+ case UpdateOp::SUB: {
+ Tparams_flat.template chip<0>(Tindices_vec(i)) -=
+ Tupdates_flat.template chip<0>(i);
+ break;
+ }
+ }
+ }
+ }
+ }
+};
+
+#define REGISTER_SCATTER_UPDATE(type, index_type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ScatterUpdate") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ ScatterUpdateOp<type, index_type, UpdateOp::ASSIGN>);
+
+#define REGISTER_SCATTER_UPDATE_INT32(type) REGISTER_SCATTER_UPDATE(type, int32)
+#define REGISTER_SCATTER_UPDATE_INT64(type) REGISTER_SCATTER_UPDATE(type, int64)
+
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_UPDATE_INT32);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_UPDATE_INT64);
+
+#undef REGISTER_SCATTER_UPDATE_INT64
+#undef REGISTER_SCATTER_UPDATE_INT32
+#undef REGISTER_SCATTER_UPDATE
+
+#define REGISTER_SCATTER_ADD(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("ScatterAdd") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ ScatterUpdateOp<type, index_type, UpdateOp::ADD>);
+
+#define REGISTER_SCATTER_ADD_INT32(type) REGISTER_SCATTER_ADD(type, int32)
+#define REGISTER_SCATTER_ADD_INT64(type) REGISTER_SCATTER_ADD(type, int64)
+
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ADD_INT32);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ADD_INT64);
+
+#undef REGISTER_SCATTER_ADD_INT32
+#undef REGISTER_SCATTER_ADD_INT64
+#undef REGISTER_SCATTER_ADD
+
+#define REGISTER_SCATTER_SUB(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("ScatterSub") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ ScatterUpdateOp<type, index_type, UpdateOp::SUB>);
+
+#define REGISTER_SCATTER_SUB_INT32(type) REGISTER_SCATTER_SUB(type, int32)
+#define REGISTER_SCATTER_SUB_INT64(type) REGISTER_SCATTER_SUB(type, int64)
+
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_SUB_INT32);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_SUB_INT64);
+
+#undef REGISTER_SCATTER_SUB_INT64
+#undef REGISTER_SCATTER_SUB_INT32
+#undef REGISTER_SCATTER_SUB
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
new file mode 100644
index 0000000000..8885f1edb3
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -0,0 +1,255 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace {
+
+class ScatterUpdateOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType index_type) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "ScatterUpdate")
+ .Input(FakeInput(DT_FLOAT_REF))
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(ScatterUpdateOpTest, Simple_TwoD32) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+ test::FillValues<float>(&expected, {100, 101, 102, 0, 0, 0, 10000, 10001,
+ 10002, 0, 0, 0, 777, 778, 779});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Simple_TwoD64) {
+ MakeOp(DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int64>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+ test::FillValues<float>(&expected, {100, 101, 102, 0, 0, 0, 10000, 10001,
+ 10002, 0, 0, 0, 777, 778, 779});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Simple_ZeroD) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5}), {0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({}), {3});
+ AddInputFromArray<float>(TensorShape({}), {101});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+ test::FillValues<float>(&expected, {0, 0, 0, 101, 0});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Simple_OneD) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5}), {0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({3}), {100, 101, 102});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+ test::FillValues<float>(&expected, {100, 0, 102, 0, 101});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, HigherRank) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({8}), {0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({2, 3}), {0, 4, 2, 1, 3, 6});
+ AddInputFromArray<float>(TensorShape({2, 3}), {10, 20, 30, 40, 50, 60});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the new state of the input
+ Tensor params_tensor = *mutable_input(0).tensor;
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({8}));
+ test::FillValues<float>(&expected, {10, 40, 30, 50, 20, 0, 60, 0});
+ test::ExpectTensorEqual<float>(expected, params_tensor);
+}
+
+TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 99});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Index 99 at offset 2 in indices is out of range"))
+ << s;
+}
+
+TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({2, 3}), {0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({1, 3}), {0, 4, 99});
+ AddInputFromArray<float>(TensorShape({3, 3}),
+ {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Must have updates.shape = indices.shape + "
+ "params.shape[1:], got "))
+ << s;
+}
+
+TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(
+ TensorShape({3, 4}),
+ {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Must have updates.shape = indices.shape + "
+ "params.shape[1:], got "))
+ << s;
+}
+
+TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
+ MakeOp(DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2});
+ AddInputFromArray<float>(TensorShape({2, 3}),
+ {100, 101, 102, 10000, 10001, 10002});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("Must have updates.shape = indices.shape + "
+ "params.shape[1:], got "))
+ << s;
+}
+
+class ScatterUpdateBM : public ScatterUpdateOpTest {
+ public:
+ virtual void TestBody() {}
+ void MakeBenchmarkOp(const char* op, DataType index_type) {
+ ASSERT_OK(NodeDefBuilder("myop", op)
+ .Input(FakeInput(DT_FLOAT_REF))
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(node_def()));
+ TF_CHECK_OK(InitOp());
+ }
+};
+
+template <typename Index>
+static void BM_ScatterHelper(int iters, int embedding_size, const char* op) {
+ testing::StopTiming();
+ const int kRows = 10000000 / embedding_size;
+ std::vector<float> values;
+ for (int i = 0; i < kRows * embedding_size; i++) {
+ values.push_back(i);
+ }
+ const int kNumUpdates = 1000;
+ random::PhiloxRandom philox(301, 17);
+ random::SimplePhilox rnd(&philox);
+ std::vector<Index> indices;
+ std::vector<float> updates;
+ for (int i = 0; i < kNumUpdates; i++) {
+ indices.push_back(rnd.Uniform(kRows));
+ for (int j = 0; j < embedding_size; j++) {
+ updates.push_back(i * 10 + j);
+ }
+ }
+
+ ScatterUpdateBM bm;
+ bm.MakeBenchmarkOp(op, DataTypeToEnum<Index>::v());
+ bm.AddInputFromArray<float>(TensorShape({kRows, embedding_size}), values);
+ bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
+ bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
+ updates);
+ testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
+ iters);
+ testing::StartTiming();
+ while (iters-- > 0) {
+ Status s = bm.RunOpKernel();
+ }
+}
+
+static void BM_ScatterUpdateInt32(int iters, int embedding_size) {
+ BM_ScatterHelper<int32>(iters, embedding_size, "ScatterUpdate");
+}
+static void BM_ScatterUpdateInt64(int iters, int embedding_size) {
+ BM_ScatterHelper<int64>(iters, embedding_size, "ScatterUpdate");
+}
+
+static void BM_ScatterAddInt32(int iters, int embedding_size) {
+ BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd");
+}
+static void BM_ScatterAddInt64(int iters, int embedding_size) {
+ BM_ScatterHelper<int64>(iters, embedding_size, "ScatterAdd");
+}
+
+BENCHMARK(BM_ScatterUpdateInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterUpdateInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
+BENCHMARK(BM_ScatterAddInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterAddInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
new file mode 100644
index 0000000000..2b6a8c5a88
--- /dev/null
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -0,0 +1,466 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// This operator handles reducing segments along the first dimension.
+// See core/ops/math_ops.cc for more details.
+template <typename Device, class T, class Index, typename Reducer>
+class SegmentReductionOp : public OpKernel {
+ public:
+ explicit SegmentReductionOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& segment_ids = context->input(1);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
+ errors::InvalidArgument("segment_ids should be a vector."));
+ const int64 num_indices = segment_ids.NumElements();
+ OP_REQUIRES(context, num_indices == input.dim_size(0),
+ errors::InvalidArgument(
+ "segment_ids should be the same size as dimension 0 of"
+ " input."));
+
+ auto input_flat = input.flat_outer_dims<T>();
+ const int64 num_col = input_flat.dimension(1);
+
+ const auto segment_vec = segment_ids.vec<Index>();
+ // Note that the current implementation assumes that segment_vec values are
+ // sorted.
+ const Index output_rows =
+ num_indices > 0 ? segment_vec(num_indices - 1) + 1 : 0;
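+ // For example (illustrative only): segment_ids = [0, 0, 1, 2, 2, 2] yields
+ // output_rows = 3, i.e. one output row per segment id.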
+
+ TensorShape output_shape = input.shape();
+ output_shape.set_dim(0, output_rows);
+
+ // Note that we do not initialize the output buffer with a default value.
+ // We require that segment ids be sorted and cover all values (otherwise we
+ // return an error).
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ auto output_flat = output->flat_outer_dims<T>();
+
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<Eigen::DenseIndex, 1> dims_to_reduce;
+ dims_to_reduce[0] = 0;
+#else
+ Eigen::IndexList<Eigen::type2index<0>> dims_to_reduce;
+#endif
+ Index start = 0, end = 1;
+ // TODO(agarwal): if this loop becomes a bottleneck, consider sharding it
+ // across threads.
+ Eigen::DSizes<Eigen::DenseIndex, 1> out_slice_shape(num_col);
+ while (end <= num_indices) {
+ if (end < num_indices) {
+ if (segment_vec(start) == segment_vec(end)) {
+ ++end;
+ continue;
+ }
+ // We have a new segment here. Verify that the segment ids grow by one
+ // each time, so that we cover every possible output value.
+ OP_REQUIRES(
+ context, segment_vec(start) + 1 == segment_vec(end),
+ errors::InvalidArgument("segment ids are not increasing by 1"));
+ }
+
+ // Process segment [start, end)
+ const T* in_slice_ptr = &input_flat(start, 0);
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> OutT;
+ T* out_slice_ptr = &output_flat(segment_vec(start), 0);
+ OutT out_slice(out_slice_ptr, out_slice_shape);
+ // We don't use out_slice.device(context->eigen_device<Device>())
+ // because these pieces of work are likely to be very small and
+ // the context switching overhead dwarfs any benefit we get from
+ // using another thread to do this work.
+ if (start == end - 1) {
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> InT;
+ InT in_slice(in_slice_ptr, out_slice_shape);
+ out_slice = in_slice;
+ } else {
+ Eigen::DSizes<Eigen::DenseIndex, 2> in_slice_shape(end - start,
+ num_col);
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
+ Eigen::Unaligned> InT;
+ InT in_slice(in_slice_ptr, in_slice_shape);
+
+ out_slice = in_slice.reduce(dims_to_reduce, Reducer());
+ }
+ start = end;
+ ++end;
+ }
+ }
+};
+
+#define REGISTER_CPU_KERNELS(type, index_type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentSum") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::SumReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentMean") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::MeanReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentProd") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::ProdReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentMin") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::MinReducer<type>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SegmentMax") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SegmentReductionOp<CPUDevice, type, index_type, \
+ Eigen::internal::MaxReducer<type>>);
+
+#define REGISTER_CPU_KERNELS_ALL(type) \
+ REGISTER_CPU_KERNELS(type, int32); \
+ REGISTER_CPU_KERNELS(type, int64);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS_ALL);
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_CPU_KERNELS_ALL
+
+// Similar to SegmentReductionOp but handles unsorted segment ids and lets the
+// caller specify the size of the output.
+template <typename Device, class T, class Index>
+class UnsortedSegmentSumOp : public OpKernel {
+ public:
+ explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& data = context->input(0);
+ const Tensor& segment_ids = context->input(1);
+ const Tensor& num_segments = context->input(2);
+
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyScalar(num_segments.shape()),
+ errors::InvalidArgument("num_segments should be a scalar, not shape ",
+ num_segments.shape().ShortDebugString()));
+
+ OP_REQUIRES(context,
+ TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
+ errors::InvalidArgument(
+ "data.shape = ", data.shape().ShortDebugString(),
+ " does not start with segment_ids.shape = ",
+ segment_ids.shape().ShortDebugString()));
+
+ const auto segment_flat = segment_ids.flat<Index>();
+ const int32 N = segment_flat.dimension(0);
+ const int32 output_rows = num_segments.scalar<int32>()();
+
+ if (N > 0) {
+ Eigen::Tensor<Index, 0, Eigen::RowMajor> m = segment_flat.maximum();
+ OP_REQUIRES(
+ context, m() < output_rows,
+ errors::InvalidArgument("More segments found than output size"));
+ }
+
+ TensorShape output_shape;
+ output_shape.AddDim(output_rows);
+ for (int i = segment_ids.dims(); i < data.dims(); i++) {
+ output_shape.AddDim(data.dim_size(i));
+ }
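+ // For example (illustrative only): data of shape [4, 5] with segment_ids of
+ // shape [4] and num_segments = 3 produces an output of shape [3, 5].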
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ auto output_flat = output->flat_outer_dims<T>();
+ output_flat.setZero();
+
+ if (data.NumElements() > 0) {
+ auto data_flat = data.shaped<T, 2>({N, data.NumElements() / N});
+ for (int i = 0; i < N; ++i) {
+ output_flat.template chip<0>(segment_flat(i)) +=
+ data_flat.template chip<0>(i);
+ }
+ }
+ }
+};
+
+#define REGISTER_CPU_UNSORTED_KERNELS(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ UnsortedSegmentSumOp<CPUDevice, type, index_type>);
+
+#define REGISTER_CPU_UNSORTED_KERNELS_ALL(type) \
+ REGISTER_CPU_UNSORTED_KERNELS(type, int32); \
+ REGISTER_CPU_UNSORTED_KERNELS(type, int64);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_UNSORTED_KERNELS_ALL);
+#undef REGISTER_CPU_UNSORTED_KERNELS
+#undef REGISTER_CPU_UNSORTED_KERNELS_ALL
+
+// Same as SegmentReductionOp but takes as input a "sparse" tensor, represented
+// by two dense tensors, one containing the data, and the other containing
+// indices into the data.
+template <typename Device, class T>
+class SparseSegmentReductionOpBase : public OpKernel {
+ public:
+ explicit SparseSegmentReductionOpBase(OpKernelConstruction* context,
+ bool is_mean)
+ : OpKernel(context), is_mean_(is_mean) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& indices = context->input(1);
+ const Tensor& segment_ids = context->input(2);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices should be a vector."));
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
+ errors::InvalidArgument("segment_ids should be a vector."));
+
+ const int32 num_indices = indices.NumElements();
+ OP_REQUIRES(context, num_indices == segment_ids.NumElements(),
+ errors::InvalidArgument(
+ "segment_ids and indices should have same size."));
+
+ auto input_flat = input.flat_outer_dims<T>();
+
+ const auto indices_vec = indices.vec<int32>();
+ const auto segment_vec = segment_ids.vec<int32>();
+ // Note that the current implementation assumes that segment_vec values are
+ // sorted.
+ const int32 output_rows =
+ num_indices > 0 ? segment_vec(num_indices - 1) + 1 : 0;
+
+ TensorShape output_shape = input.shape();
+ output_shape.set_dim(0, output_rows);
+
+ // Note that we do not initialize the output buffer with a default value.
+ // We require that segment ids be sorted and cover all values (otherwise we
+ // return an error).
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ if (num_indices == 0) return;
+ auto output_flat = output->flat_outer_dims<T>();
+
+ int32 start = 0, end = 1;
+ while (end <= num_indices) {
+ if (end < num_indices) {
+ if (segment_vec(start) == segment_vec(end)) {
+ ++end;
+ continue;
+ }
+ // We have a new segment here. Verify that the segment ids grow by one
+ // each time, so that we cover every possible output value.
+ OP_REQUIRES(
+ context, segment_vec(start) + 1 == segment_vec(end),
+ errors::InvalidArgument("segment ids are not increasing by 1"));
+ }
+
+ auto out = output_flat.template chip<0>(segment_vec(start));
+#define I(i) input_flat.template chip<0>(indices_vec(start + i))
+ int num = end - start;
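+ // For segments with more than one row, the switch below sums the first
+ // num % 8 rows (remainders of 0 and 1 are treated as 8 and 9 so the loop
+ // afterwards always consumes full groups of eight), and the loop then adds
+ // the remaining rows eight at a time. For means of fewer than 10 rows the
+ // division by num is folded into that first group via m.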
+ if (num == 1) {
+ out = I(0);
+ } else {
+ int r = num % 8;
+ T m = (is_mean_ && (num < 10)) ? num : 1;
+ switch (r) {
+ case 2:
+ out = (I(0) + I(1)) / m;
+ break;
+ case 3:
+ out = (I(0) + I(1) + I(2)) / m;
+ break;
+ case 4:
+ out = (I(0) + I(1) + I(2) + I(3)) / m;
+ break;
+ case 5:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4)) / m;
+ break;
+ case 6:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5)) / m;
+ break;
+ case 7:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6)) / m;
+ break;
+ case 0:
+ out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6) + I(7)) / m;
+ r = 8;
+ break;
+ case 1:
+ out =
+ (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6) + I(7) + I(8)) /
+ m;
+ r = 9;
+ break;
+ }
+ for (; r < num; r += 8) {
+ out += I(r) + I(r + 1) + I(r + 2) + I(r + 3) + I(r + 4) + I(r + 5) +
+ I(r + 6) + I(r + 7);
+ }
+#undef I
+ if (is_mean_ && num >= 10) {
+ out = out / static_cast<T>(num);
+ }
+ }
+ start = end;
+ ++end;
+ }
+ }
+
+ private:
+ bool is_mean_;
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionMeanOp
+ : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+ explicit SparseSegmentReductionMeanOp(OpKernelConstruction* context)
+ : SparseSegmentReductionOpBase<Device, T>(context, true /*is_mean*/) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionSumOp
+ : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+ explicit SparseSegmentReductionSumOp(OpKernelConstruction* context)
+ : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/) {}
+};
+
+#define REGISTER_CPU_SPARSE_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseSegmentSum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseSegmentReductionSumOp<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS);
+#undef REGISTER_CPU_SPARSE_KERNELS
+
+#define REGISTER_CPU_SPARSE_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseSegmentMean").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseSegmentReductionMeanOp<CPUDevice, type>);
+REGISTER_CPU_SPARSE_KERNELS(float);
+REGISTER_CPU_SPARSE_KERNELS(double);
+#undef REGISTER_CPU_SPARSE_KERNELS
+
+template <class T>
+class SparseSegmentMeanGradOp : public OpKernel {
+ public:
+ explicit SparseSegmentMeanGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& indices = context->input(1);
+ const Tensor& segment_ids = context->input(2);
+ const Tensor& output_dim0 = context->input(3);
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices should be a vector."));
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
+ errors::InvalidArgument("segment_ids should be a vector."));
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(output_dim0.shape()),
+ errors::InvalidArgument("output_dim0 should be a scalar."));
+
+ const int64 N = indices.NumElements();
+ OP_REQUIRES(context, N == segment_ids.NumElements(),
+ errors::InvalidArgument(
+ "segment_ids and indices should have same size."));
+ const int32 M = output_dim0.scalar<int32>()();
+
+ auto input_flat = input.flat_outer_dims<T>();
+ const auto indices_vec = indices.vec<int32>();
+ const auto segment_vec = segment_ids.vec<int32>();
+
+ TensorShape output_shape = input.shape();
+ output_shape.set_dim(0, M);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ if (M == 0 || N == 0) return;
+
+ // Note that similar to SparseSegmentMean, we assume that segment_vec is
+ // already sorted and has non-negative values.
+ int num_segments = segment_vec(N - 1) + 1;
+ OP_REQUIRES(context, input.dim_size(0) == num_segments,
+ errors::InvalidArgument("Invalid number of segments"));
+
+ // Compute scaling factors for input.
+ std::vector<double> scaling(num_segments, 0.0);
+ for (int64 i = 0; i < N; ++i) {
+ scaling[segment_vec(i)] += 1;
+ }
+ for (int i = 0; i < scaling.size(); ++i) {
+ scaling[i] = 1.0 / std::max(scaling[i], 1.0);
+ }
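+ // For example (illustrative only): segment_vec = [0, 0, 1] gives counts
+ // {2, 1} and scaling = {0.5, 1.0}; segments that never appear keep a scale
+ // of 1.0 because of the std::max above.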
+
+ auto output_flat = output->flat_outer_dims<T>();
+ output_flat.setZero();
+ std::vector<bool> is_modified(M, false);
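+ // is_modified tracks which output rows have already received a
+ // contribution: the first write to a row assigns, later writes accumulate,
+ // and rows that are never touched keep the zeros from setZero() above.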
+
+ for (int64 i = 0; i < N; ++i) {
+ int output_idx = indices_vec(i);
+ int idx = segment_vec(i);
+ T scale = static_cast<T>(scaling[idx]);
+ if (is_modified[output_idx]) {
+ if (scale == 1.0) {
+ output_flat.template chip<0>(output_idx) +=
+ input_flat.template chip<0>(idx);
+ } else {
+ output_flat.template chip<0>(output_idx) +=
+ input_flat.template chip<0>(idx) * scale;
+ }
+ } else {
+ if (scale == 1.0) {
+ output_flat.template chip<0>(output_idx) =
+ input_flat.template chip<0>(idx);
+ } else {
+ output_flat.template chip<0>(output_idx) =
+ input_flat.template chip<0>(idx) * scale;
+ }
+ }
+ is_modified[output_idx] = true;
+ }
+ }
+};
+
+#define REGISTER_CPU_SPARSE_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER(Name("SparseSegmentMeanGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T"), \
+ SparseSegmentMeanGradOp<type>);
+
+REGISTER_CPU_SPARSE_KERNELS(float);
+REGISTER_CPU_SPARSE_KERNELS(double);
+
+#undef REGISTER_CPU_SPARSE_KERNELS
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
new file mode 100644
index 0000000000..87647a21a8
--- /dev/null
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -0,0 +1,157 @@
+#include <functional>
+
+#include "tensorflow/core/public/session_options.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+
+namespace tensorflow {
+
+template <typename Index>
+static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
+ Index num_cols, Index segment_size) {
+ testing::StopTiming();
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ // Create inputs
+ gtl::InlinedVector<TensorValue, 4> reduction_inputs;
+ TensorShape shape1({num_rows, num_cols});
+ Tensor input1(DT_FLOAT, shape1);
+ reduction_inputs.push_back({nullptr, &input1});
+
+ TensorShape shape2({num_rows});
+ Tensor input2(DataTypeToEnum<Index>::v(), shape2);
+ test::FillFn<Index>(&input2, [&num_rows, &segment_size](Index i) -> Index {
+ return std::min(i / segment_size, num_rows - 1);
+ });
+ reduction_inputs.push_back({nullptr, &input2});
+
+ NodeDef reduction_node_def;
+ TF_CHECK_OK(NodeDefBuilder(reduction, reduction)
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DataTypeToEnum<Index>::v()))
+ .Finalize(&reduction_node_def));
+ Status status;
+ std::unique_ptr<OpKernel> reduction_op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), reduction_node_def, &status));
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &reduction_inputs;
+ params.op_kernel = reduction_op.get();
+ params.output_alloc_attr = [&device, &reduction_op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host =
+ (reduction_op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> reduction_context(
+ new OpKernelContext(params));
+
+ reduction_op->Compute(reduction_context.get());
+ TF_CHECK_OK(reduction_context->status());
+ testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete reduction_context->release_output(0).tensor;
+ reduction_op->Compute(reduction_context.get());
+ }
+ int64 bytes_per_iter =
+ static_cast<int64>(num_rows * num_cols * sizeof(float));
+ testing::BytesProcessed(bytes_per_iter * iters);
+}
+
+#define BM_Reduce(O, R, C, S) \
+ static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \
+ BM_SegmentReduction<int32>(iters, #O, R, C, S); \
+ } \
+ static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \
+ BM_SegmentReduction<int64>(iters, #O, R, C, S); \
+ } \
+ BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32); \
+ BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64);
+
+#define BM_Reduce_Arg(R, C, S) \
+ BM_Reduce(SegmentSum, R, C, S); \
+ BM_Reduce(SegmentMean, R, C, S);
+
+BM_Reduce_Arg(64, 32, 1);
+BM_Reduce_Arg(4096, 128, 1);
+
+BM_Reduce_Arg(16, 8, 2);
+BM_Reduce_Arg(64, 32, 2);
+BM_Reduce_Arg(4096, 32, 2);
+BM_Reduce_Arg(4096, 128, 2);
+
+static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ CHECK_LE(uniqueness, 1.0);
+ CHECK_GT(uniqueness, 0.0);
+
+ const int kNumIndices = size;
+ Tensor indices(DT_INT32, TensorShape({kNumIndices}));
+ auto indices_flat = indices.flat<int32>();
+ Tensor segments(DT_INT32, TensorShape({kNumIndices}));
+ auto segments_flat = segments.flat<int32>();
+
+ int kUniqueIndices = uniqueness * kNumIndices;
+ Tensor output_dim0(DT_INT32, TensorShape({}));
+ output_dim0.scalar<int32>()() = kUniqueIndices;
+
+ for (int i = 0; i < kNumIndices; ++i) {
+ indices_flat(i) = (i * 31) % kUniqueIndices;
+ segments_flat(i) = i * .8;
+ }
+
+ const int kDim1 = segments_flat(kNumIndices - 1) + 1;
+ const int kDim2 = 128;
+ Tensor input(DT_FLOAT, TensorShape({kDim1, kDim2}));
+ input.flat<float>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseSegmentMeanGrad")
+ .Input(test::graph::Constant(g, input))
+ .Input(test::graph::Constant(g, indices))
+ .Input(test::graph::Constant(g, segments))
+ .Input(test::graph::Constant(g, output_dim0))
+ .Attr("T", DT_FLOAT)
+ .Finalize(g, &node));
+
+ testing::UseRealTime();
+ testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
+ sizeof(float));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+}
+
+static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
+ return SparseSegmentMeanGradHelper(iters, 1.0, size);
+}
+
+static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
+ return SparseSegmentMeanGradHelper(iters, 0.01, size);
+}
+
+BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
new file mode 100644
index 0000000000..2abb183d1a
--- /dev/null
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -0,0 +1,116 @@
+#include "tensorflow/core/kernels/sendrecv_ops.h"
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
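+// A rendezvous key names one logical tensor transfer: the sending device,
+// its incarnation number, the receiving device and the tensor name, with the
+// frame/iteration of the executing graph appended below.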
+static string GetRendezvousKeyPrefix(const string& send_device,
+ const string& recv_device,
+ const uint64 send_device_incarnation,
+ const string& tensor_name) {
+ return strings::StrCat(send_device, ";",
+ strings::FpToString(send_device_incarnation), ";",
+ recv_device, ";", tensor_name);
+}
+
+static string GetRendezvousKey(const string& key_prefix,
+ const FrameAndIter& frame_iter) {
+ return strings::StrCat(key_prefix, ";", frame_iter.frame_id, ":",
+ frame_iter.iter_id);
+}
+
+SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ string send_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
+ string recv_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device));
+ uint64 send_device_incarnation;
+ OP_REQUIRES_OK(
+ ctx, ctx->GetAttr("send_device_incarnation",
+ reinterpret_cast<int64*>(&send_device_incarnation)));
+ string tensor_name;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+ key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
+ send_device_incarnation, tensor_name);
+}
+
+void SendOp::Compute(OpKernelContext* ctx) {
+ OP_REQUIRES(
+ ctx, ctx->rendezvous() != nullptr,
+ errors::Internal("Op kernel context needs to provide a rendezvous."));
+ const string key = GetRendezvousKey(key_prefix_, ctx->frame_iter());
+ VLOG(2) << "Send " << key;
+
+  // The device context is passed across the Send/Recv boundary, so that
+  // the device context used to produce the tensor is also used when
+  // performing the copy on the recv side (which may be a different
+  // device).
+ Rendezvous::Args args;
+ args.device_context = ctx->op_device_context();
+ args.alloc_attrs = ctx->input_alloc_attr(0);
+ Status s =
+ ctx->rendezvous()->Send(key, args, ctx->input(0), ctx->is_input_dead());
+ ctx->SetStatus(s);
+}
+
+REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp);
+REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
+
+REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
+REGISTER_KERNEL_BUILDER(
+ Name("_HostSend").Device(DEVICE_GPU).HostMemory("tensor"), SendOp);
+
+RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+ string send_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
+ string recv_device;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device));
+ uint64 send_device_incarnation;
+ OP_REQUIRES_OK(
+ ctx, ctx->GetAttr("send_device_incarnation",
+ reinterpret_cast<int64*>(&send_device_incarnation)));
+ string tensor_name;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+ key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
+ send_device_incarnation, tensor_name);
+}
+
+void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+ OP_REQUIRES(
+ ctx, ctx->rendezvous() != nullptr,
+ errors::Internal("Op kernel context needs to provide a rendezvous."));
+ const string key = GetRendezvousKey(key_prefix_, ctx->frame_iter());
+ VLOG(2) << "Recv " << key;
+
+ Rendezvous::Args args;
+ args.device_context = ctx->op_device_context();
+ args.alloc_attrs = ctx->output_alloc_attr(0);
+ ctx->rendezvous()->RecvAsync(
+ key, args, [ctx, done](const Status& s, const Rendezvous::Args& send_args,
+ const Rendezvous::Args& recv_args,
+ const Tensor& val, bool is_dead) {
+ ctx->SetStatus(s);
+ if (s.ok()) {
+ // 'ctx' allocates the output tensor of the expected type. The
+ // runtime checks whether the tensor received here is the same type.
+ if (!is_dead) {
+ ctx->set_output(0, val);
+ }
+ *ctx->is_output_dead() = is_dead;
+ }
+ done();
+ });
+}
+
+REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp);
+REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
+
+REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
+REGISTER_KERNEL_BUILDER(
+ Name("_HostRecv").Device(DEVICE_GPU).HostMemory("tensor"), RecvOp);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h
new file mode 100644
index 0000000000..b3f5703ccf
--- /dev/null
+++ b/tensorflow/core/kernels/sendrecv_ops.h
@@ -0,0 +1,32 @@
+#ifndef TENSORFLOW_KERNELS_SENDRECV_OPS_H_
+#define TENSORFLOW_KERNELS_SENDRECV_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class SendOp : public OpKernel {
+ public:
+ explicit SendOp(OpKernelConstruction* ctx);
+ void Compute(OpKernelContext* ctx) override;
+
+ private:
+ string key_prefix_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(SendOp);
+};
+
+class RecvOp : public AsyncOpKernel {
+ public:
+ explicit RecvOp(OpKernelConstruction* ctx);
+ void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+ string key_prefix_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(RecvOp);
+};
+
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SENDRECV_OPS_H_
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
new file mode 100644
index 0000000000..60ba2e15f9
--- /dev/null
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -0,0 +1,123 @@
+// See docs in ../ops/math_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+int32 GetValue(int32 v) { return v; }
+
+template <typename T>
+class RangeOp : public OpKernel {
+ public:
+ explicit RangeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& start_in = context->input(0);
+ const Tensor& limit_in = context->input(1);
+ const Tensor& delta_in = context->input(2);
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(start_in.shape()),
+ errors::InvalidArgument("start must be a scalar, not shape ",
+ start_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(limit_in.shape()),
+ errors::InvalidArgument("limit must be a scalar, not shape ",
+ limit_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(delta_in.shape()),
+ errors::InvalidArgument("delta must be a scalar, not shape ",
+ delta_in.shape().ShortDebugString()));
+ const int32 start = GetValue(start_in.scalar<T>()());
+ const int32 limit = GetValue(limit_in.scalar<T>()());
+ OP_REQUIRES(context, start <= limit,
+ errors::InvalidArgument("Requires start <= limit: ", start, "/",
+ limit));
+ const int32 delta = GetValue(delta_in.scalar<T>()());
+ OP_REQUIRES(context, delta > 0,
+ errors::InvalidArgument("Requires delta > 0: ", delta));
+ int32 size = (limit - start + delta - 1) / delta;
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({size}), &out));
+ auto flat = out->flat<T>();
+ int32 val = start;
+ for (int32 i = 0; i < size; ++i) {
+ flat(i) = T(val);
+ val += delta;
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Range")
+ .Device(DEVICE_CPU)
+ .HostMemory("start")
+ .HostMemory("limit")
+ .HostMemory("delta")
+ .HostMemory("output"),
+ RangeOp<int32>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Range")
+ .Device(DEVICE_GPU)
+ .HostMemory("start")
+ .HostMemory("limit")
+ .HostMemory("delta")
+ .HostMemory("output"),
+ RangeOp<int32>);
+#endif // GOOGLE_CUDA
+
+template <typename T>
+class LinSpaceOp : public OpKernel {
+ public:
+ explicit LinSpaceOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& start_in = context->input(0);
+ const Tensor& stop_in = context->input(1);
+ const Tensor& num_in = context->input(2);
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(start_in.shape()),
+ errors::InvalidArgument("start must be a scalar, not shape ",
+ start_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(stop_in.shape()),
+ errors::InvalidArgument("stop must be a scalar, not shape ",
+ stop_in.shape().ShortDebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_in.shape()),
+ errors::InvalidArgument("num must be a scalar, not shape ",
+ num_in.shape().ShortDebugString()));
+ const T start = start_in.scalar<T>()();
+ const T stop = stop_in.scalar<T>()();
+ const int32 num = num_in.scalar<int32>()();
+ OP_REQUIRES(context, num > 0,
+ errors::InvalidArgument("Requires num > 0: ", num));
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({num}), &out));
+ auto flat = out->flat<T>();
+ if (num == 1) {
+ flat(0) = start;
+ } else {
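+      // Both endpoints are included, so num points span the range in
+      // num - 1 equal steps.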
+ const T step = (stop - start) / (num - 1);
+ for (int32 i = 0; i < num; ++i) flat(i) = start + step * i;
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LinSpace")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("start")
+ .HostMemory("stop")
+ .HostMemory("num")
+ .HostMemory("output"),
+ LinSpaceOp<float>);
+REGISTER_KERNEL_BUILDER(Name("LinSpace")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("start")
+ .HostMemory("stop")
+ .HostMemory("num")
+ .HostMemory("output"),
+ LinSpaceOp<double>);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
new file mode 100644
index 0000000000..7cb1da8983
--- /dev/null
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -0,0 +1,261 @@
+// See docs in ../ops/array_ops.cc.
+
+#include <unordered_set>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+class ShapeOp : public OpKernel {
+ public:
+ explicit ShapeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ const int rank = inp.dims();
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({rank}), &out));
+ auto vec = out->vec<int32>();
+ for (int i = 0; i < rank; ++i) vec(i) = inp.dim_size(i);
+ }
+
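+  // Producing a shape only reads tensor metadata, so the kernel is cheap.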
+ bool IsExpensive() override { return false; }
+};
+REGISTER_KERNEL_BUILDER(Name("Shape").Device(DEVICE_CPU).HostMemory("output"),
+ ShapeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Shape") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("output") \
+ .TypeConstraint<type>("T"), \
+ ShapeOp)
+TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Shape")
+ .Device(DEVICE_GPU)
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ ShapeOp);
+
+class RankOp : public OpKernel {
+ public:
+ explicit RankOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ const int rank = inp.dims();
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+ out->scalar<int32>()() = rank;
+ }
+
+ bool IsExpensive() override { return false; }
+};
+REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"),
+ RankOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Rank") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("output"), \
+ RankOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Rank")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("input")
+ .HostMemory("output"),
+ RankOp);
+
+class SizeOp : public OpKernel {
+ public:
+ explicit SizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& inp = ctx->input(0);
+ const int64 size = inp.NumElements();
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+ // TODO(josh11b): switch output to int64?
+ out->scalar<int32>()() = size;
+ }
+
+ bool IsExpensive() override { return false; }
+};
+REGISTER_KERNEL_BUILDER(Name("Size").Device(DEVICE_CPU).HostMemory("output"),
+ SizeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Size") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("output"), \
+ SizeOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Size")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("input")
+ .HostMemory("output"),
+ SizeOp);
+
+class ExpandDimsOp : public OpKernel {
+ public:
+ explicit ExpandDimsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ int dim = ctx->input(1).flat<int>()(0);
+ OP_REQUIRES(
+ ctx, (dim >= -1 - ctx->input(0).dims() && dim <= ctx->input(0).dims()),
+ errors::InvalidArgument("Tried to expand dim index ", dim,
+ " for tensor with ", ctx->input(0).dims(),
+ " dimensions."));
+
+ auto existing_dims = ctx->input(0).shape().dim_sizes();
+ std::vector<int64> new_shape(existing_dims.size());
+ for (size_t i = 0; i < new_shape.size(); ++i) {
+ new_shape[i] = existing_dims[i];
+ }
+
+    // We emulate numpy's interpretation of the dim axis when
+    // -1 - input.dims() <= dim <= input.dims().
+ if (dim < 0) {
+ dim += existing_dims.size() + 1;
+ }
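+    // For example, a rank-3 input with dim == -1 maps to dim == 3, i.e. the
+    // new axis is inserted at the end.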
+
+ // Clamp to the end if needed.
+ dim = std::min<int32>(dim, existing_dims.size());
+ new_shape.emplace(new_shape.begin() + dim, 1);
+ const TensorShape output_shape(new_shape);
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {0}, &output));
+ if (!output->CopyFrom(ctx->input(0), output_shape)) {
+ // This should never happen, since the sizes of the input and output
+ // should always be the same (we only expand the dimension with 1).
+ ctx->SetStatus(
+ errors::Internal("Could not expand dimension with input shape ",
+ ctx->input(0).shape().DebugString(),
+ " and output shape ", output_shape.DebugString()));
+ }
+ }
+};
+REGISTER_KERNEL_BUILDER(Name("ExpandDims").Device(DEVICE_CPU).HostMemory("dim"),
+ ExpandDimsOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("ExpandDims") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("dim"), \
+ ExpandDimsOp);
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .HostMemory("input")
+ .HostMemory("dim")
+ .HostMemory("output"),
+ ExpandDimsOp);
+
+class SqueezeOp : public OpKernel {
+ public:
+ explicit SqueezeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ std::vector<int32> squeeze_dims;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("squeeze_dims", &squeeze_dims));
+ squeeze_dims_.insert(squeeze_dims.begin(), squeeze_dims.end());
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ auto existing_dims = ctx->input(0).shape().dim_sizes();
+ std::vector<int64> new_shape;
+
+ std::unordered_set<int32> wrapped_squeeze_dims;
+ wrapped_squeeze_dims.reserve(squeeze_dims_.size());
+ // Validate squeeze dims against the input.
+ for (int32 dim : squeeze_dims_) {
+ OP_REQUIRES(
+ ctx, (dim >= -ctx->input(0).dims() && dim < ctx->input(0).dims()),
+ errors::InvalidArgument("Tried to squeeze dim index ", dim,
+ " for tensor with ", ctx->input(0).dims(),
+ " dimensions."));
+      // If dim is < 0, we wrap around (-1 means the last dimension).
+ if (dim < 0) {
+ dim = existing_dims.size() + dim;
+ }
+
+ wrapped_squeeze_dims.insert(dim);
+ }
+
+ for (size_t i = 0; i < existing_dims.size(); ++i) {
+ auto existing_dim = existing_dims[i];
+
+      // If explicit squeeze dims were given, only squeeze those dimensions.
+ if (!wrapped_squeeze_dims.empty()) {
+ if (wrapped_squeeze_dims.count(i) > 0) {
+ OP_REQUIRES(ctx, existing_dim == 1,
+ errors::InvalidArgument("Tried to explicitly squeeze "
+ "dimension ",
+ i, " but dimension was not 1: ",
+ existing_dim));
+ } else {
+ // This dimension is not being squeezed.
+ new_shape.push_back(existing_dim);
+ }
+ } else {
+ // Copy over all non-1-length dimensions.
+ if (existing_dim != 1) {
+ new_shape.push_back(existing_dim);
+ }
+ }
+ }
+
+ const TensorShape output_shape(new_shape);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {0}, &output));
+ if (!output->CopyFrom(ctx->input(0), output_shape)) {
+ // This should never happen, since the sizes of the input and
+ // output should always be the same.
+ ctx->SetStatus(errors::Internal("Could not squeeze input with shape ",
+ ctx->input(0).shape().DebugString(),
+ " and output shape ",
+ output_shape.DebugString()));
+ }
+ }
+
+ private:
+ std::unordered_set<int32> squeeze_dims_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
+
+#define REGISTER_GPU_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Squeeze").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ SqueezeOp);
+TF_CALL_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
new file mode 100644
index 0000000000..3477266d5d
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -0,0 +1,242 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/slice_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace {
+
+gtl::InlinedVector<int64, 4> IntTensorToInt64Vec(const Tensor& tensor) {
+ gtl::InlinedVector<int64, 4> out;
+ if (tensor.dtype() == DT_INT32) {
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ out.push_back(tensor.flat<int32>()(i));
+ }
+ } else if (tensor.dtype() == DT_INT64) {
+ for (int64 i = 0; i < tensor.NumElements(); ++i) {
+ out.push_back(tensor.flat<int64>()(i));
+ }
+ } else {
+ LOG(FATAL) << "begin must be either int32 or int64";
+ }
+ return out;
+}
+
+} // namespace
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Shared code that is not dependent on the type of T. We do this to reduce
+// code size by not duplicating all this for all T (float, double, int32, etc.)
+static void SharedValidation(OpKernelContext* context,
+ TensorShape* output_shape, bool* is_identity,
+ bool* slice_dim0,
+ gtl::InlinedVector<int64, 4>* begin,
+ gtl::InlinedVector<int64, 4>* size) {
+ const Tensor& input = context->input(0);
+ const Tensor& begin_tensor = context->input(1);
+ const Tensor& size_tensor = context->input(2);
+
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyVector(begin_tensor.shape()) &&
+ TensorShapeUtils::IsLegacyVector(size_tensor.shape()) &&
+ begin_tensor.NumElements() == input.dims() &&
+ size_tensor.NumElements() == input.dims(),
+ errors::InvalidArgument(
+ "Expected begin and size arguments to be 1-D tensors of size ",
+ input.dims(), ", but got ", begin_tensor.NumElements(), " and ",
+ size_tensor.NumElements(), " instead."));
+
+ const int input_dims = input.dims();
+ *begin = IntTensorToInt64Vec(begin_tensor);
+ *size = IntTensorToInt64Vec(size_tensor);
+ for (int i = 0; i < input_dims; ++i) {
+ if ((*size)[i] == -1) {
+ // A size[i] of -1 means "all elements from begin[i] to dim_size(i)".
+ (*size)[i] = input.dim_size(i) - (*begin)[i];
+ }
+ }
+
+ *is_identity = true;
+ *slice_dim0 = true;
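+  // is_identity stays true only if every dimension is taken in full;
+  // slice_dim0 stays true if at most dimension 0 is actually sliced, which
+  // enables the cheaper paths in SliceOp::Compute below.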
+ for (int i = 0; i < input_dims; ++i) {
+ int64 b = (*begin)[i];
+ int64 s = (*size)[i];
+ if (input.dim_size(i) == 0) {
+ OP_REQUIRES(
+ context, b == 0 && s == 0,
+ errors::InvalidArgument("Expected begin[", i, "] == 0 (got ", b,
+ ") and size[", i, "] == 0 ", "(got ", s,
+ ") when ", "input.dim_size(", i, ") == 0"));
+ } else {
+ OP_REQUIRES(context, 0 <= b && b <= input.dim_size(i),
+ errors::InvalidArgument("Expected begin[", i, "] in [0, ",
+ input.dim_size(i), "], but got ", b));
+ OP_REQUIRES(
+ context, 0 <= s && b + s <= input.dim_size(i),
+ errors::InvalidArgument("Expected size[", i, "] in [0, ",
+ input.dim_size(i) - b, "], but ", "got ", s));
+ }
+ output_shape->AddDim(s);
+ const bool take_all = (b == 0) && (s == input.dim_size(i));
+ (*is_identity) &= take_all;
+ (*slice_dim0) &= (i == 0) || take_all;
+ }
+}
+
+template <typename Device, typename T>
+class SliceOp : public OpKernel {
+ public:
+ explicit SliceOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ TensorShape output_shape;
+ bool is_identity = true;
+ bool slice_dim0 = true;
+ gtl::InlinedVector<int64, 4> begin;
+ gtl::InlinedVector<int64, 4> size;
+ SharedValidation(context, &output_shape, &is_identity, &slice_dim0, &begin,
+ &size);
+ if (!context->status().ok()) return;
+ const Tensor& input = context->input(0);
+ if (is_identity) {
+ VLOG(1) << "Slice identity";
+ context->set_output(0, input);
+ return;
+ }
+
+ if (slice_dim0 && IsInnerDimsSizeAligned<T>(input.shape())) {
+ VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
+ CHECK_GE(input.dims(), 1); // Otherwise, is_identity should be true.
+ context->set_output(0, input.Slice(begin[0], begin[0] + size[0]));
+ return;
+ }
+
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+ const int input_dims = input.dims();
+
+ if (output_shape.num_elements() > 0) {
+ if (std::is_same<Device, CPUDevice>::value && input_dims == 2 &&
+ DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ auto input = context->input(0).tensor<T, 2>();
+ auto output = result->tensor<T, 2>();
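+        // Each output row is a contiguous span of the corresponding input
+        // row, so it can be copied with a single memcpy while prefetching
+        // the next rows.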
+ // TODO(agarwal): Consider multi-threading this loop for cases where
+ // size[0] is very large.
+ for (int i = 0; i < size[0]; ++i) {
+ const int row = begin[0] + i;
+ if (i + 1 < size[0]) {
+ port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
+ port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));
+ }
+ memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T));
+ }
+ return;
+ }
+#define HANDLE_DIM(NDIM) \
+ if (input_dims == NDIM) { \
+ HandleCase<NDIM>(context, begin, size, result); \
+ return; \
+ }
+
+ HANDLE_DIM(1);
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+#undef HANDLE_DIM
+
+ OP_REQUIRES(context, false, errors::Unimplemented(
+ "SliceOp : Unhandled input dimensions"));
+ }
+ }
+
+ private:
+ template <int NDIM>
+ void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+ const gtl::ArraySlice<int64>& size, Tensor* result) {
+ Eigen::DSizes<ptrdiff_t, NDIM> indices;
+ Eigen::DSizes<ptrdiff_t, NDIM> sizes;
+ for (int i = 0; i < NDIM; ++i) {
+ indices[i] = begin[i];
+ sizes[i] = size[i];
+ }
+
+ functor::Slice<Device, T, NDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), indices, sizes);
+ }
+};
+
+#define REGISTER_SLICE(type) \
+ REGISTER_KERNEL_BUILDER(Name("Slice") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("begin") \
+ .HostMemory("size"), \
+ SliceOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_SLICE);
+REGISTER_SLICE(bfloat16);
+
+#undef REGISTER_SLICE
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T, NDIM) \
+ template <> \
+ void Slice<GPUDevice, T, NDIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+ typename TTypes<T, NDIM>::ConstTensor input, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& indices, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& sizes); \
+ extern template struct Slice<GPUDevice, T, NDIM>;
+
+#define DECLARE_FOR_N(T) \
+ DECLARE_GPU_SPEC(T, 1); \
+ DECLARE_GPU_SPEC(T, 2); \
+ DECLARE_GPU_SPEC(T, 3); \
+ DECLARE_GPU_SPEC(T, 4); \
+ DECLARE_GPU_SPEC(T, 5);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N);
+DECLARE_FOR_N(int32);
+
+#undef DECLARE_FOR_N
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("Slice") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("begin") \
+ .HostMemory("size") \
+ .TypeConstraint<int32>("Index"), \
+ SliceOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+REGISTER_GPU(int32);
+
+#undef REGISTER_GPU
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
new file mode 100644
index 0000000000..1b6bd9c112
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op.h
@@ -0,0 +1,25 @@
+#ifndef TENSORFLOW_KERNELS_SLICE_OP_H_
+#define TENSORFLOW_KERNELS_SLICE_OP_H_
+
+// Functor definition for SliceOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, int NDIMS>
+struct Slice {
+ void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
+ typename TTypes<T, NDIMS>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, NDIMS>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, NDIMS>& slice_sizes) {
+ output.device(d) = input.slice(slice_indices, slice_sizes);
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SLICE_OP_H_
diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc
new file mode 100644
index 0000000000..6e919b244c
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc
@@ -0,0 +1,31 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/slice_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::Slice<GPUDevice, T, 1>; \
+ template struct functor::Slice<GPUDevice, T, 2>; \
+ template struct functor::Slice<GPUDevice, T, 3>; \
+ template struct functor::Slice<GPUDevice, T, 4>; \
+ template struct functor::Slice<GPUDevice, T, 5>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+DEFINE_GPU_KERNELS(int32);
+
+#undef DEFINE_GPU_KERNELS
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/slice_op_test.cc b/tensorflow/core/kernels/slice_op_test.cc
new file mode 100644
index 0000000000..27c78c6dc0
--- /dev/null
+++ b/tensorflow/core/kernels/slice_op_test.cc
@@ -0,0 +1,73 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+// For the benchmark, we set up a single 2-dimensional tensor of size
+// (2 * kDim) x kMaxSize and slice a kDim x 'size' block out of it.
+template <typename T>
+static void SliceHelper(int iters, int size) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+ DataType dt = DataTypeToEnum<T>::v();
+ int kDim = 100;
+ int kMaxSize = 15000;
+ CHECK_LT(size, kMaxSize);
+
+ Tensor begin(DT_INT32, TensorShape({2}));
+ begin.flat<int32>()(0) = 10;
+ begin.flat<int32>()(1) = 10;
+
+ Tensor sizes(DT_INT32, TensorShape({2}));
+ sizes.flat<int32>()(0) = kDim;
+ sizes.flat<int32>()(1) = size;
+
+ Tensor input(dt, TensorShape({2 * kDim, kMaxSize}));
+ input.flat<T>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Slice")
+ .Input(test::graph::Constant(g, input))
+ .Input(test::graph::Constant(g, begin))
+ .Input(test::graph::Constant(g, sizes))
+ .Attr("T", dt)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
+static void BM_SliceFloat(int iters, int dim2) {
+ SliceHelper<float>(iters, dim2);
+}
+
+BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
+
+static void BM_SliceBFloat16(int iters, int dim2) {
+ SliceHelper<bfloat16>(iters, dim2);
+}
+
+BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
new file mode 100644
index 0000000000..abe6331a4f
--- /dev/null
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -0,0 +1,62 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/kernels/softmax_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SoftmaxOp : public OpKernel {
+ public:
+ explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& logits_in = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
+ errors::InvalidArgument("logits must be 2-dimensional"));
+ Tensor* softmax_out = nullptr;
+ OP_REQUIRES_OK(
+ context, context->allocate_output(0, logits_in.shape(), &softmax_out));
+ functor::SoftmaxFunctor<Device, T> functor;
+ functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
+ softmax_out->matrix<T>());
+ }
+};
+
+// Partial specialization for a CPUDevice, that uses the Eigen implementation
+// from SoftmaxEigenImpl.
+namespace functor {
+template <typename T>
+struct SoftmaxFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax) {
+ SoftmaxEigenImpl<CPUDevice, T>::Compute(d, logits, softmax);
+ }
+};
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("Softmax")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("Softmax")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ SoftmaxOp<CPUDevice, double>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Softmax")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op.h b/tensorflow/core/kernels/softmax_op.h
new file mode 100644
index 0000000000..69bd531b70
--- /dev/null
+++ b/tensorflow/core/kernels/softmax_op.h
@@ -0,0 +1,70 @@
+#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_H_
+#define TENSORFLOW_KERNELS_SOFTMAX_OP_H_
+// Functor definition for SoftmaxOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by SoftmaxOp to do the computations.
+template <typename Device, typename T>
+struct SoftmaxFunctor {
+ // Computes Softmax activation.
+ //
+ // logits: dim: batch_size, num_classes.
+ // softmax: dims: batch_size, num_classes.
+ void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax);
+};
+
+// Eigen code implementing SoftmaxFunctor::operator().
+// This code works for both CPU and GPU and is used by the functor
+// specializations for both device types.
+template <typename Device, typename T>
+struct SoftmaxEigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax) {
+ const int kBatchDim = 0;
+ const int kClassDim = 1;
+
+ const int batch_size = logits.dimension(kBatchDim);
+ const int num_classes = logits.dimension(kClassDim);
+
+// These arrays are used to reduce along the class dimension, and broadcast
+// the resulting value to all classes.
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<int, 1> along_class(kClassDim);
+ Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+ Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+#else
+ Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
+ Eigen::IndexList<Eigen::type2index<1> > depth_dim;
+ Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
+ batch_by_one.set(0, batch_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
+ one_by_class.set(1, num_classes);
+#endif
+ // NOTE(mdevin): If you modify this implementation please run
+ // the ImageNetSoftmaxFwd benchmark in core_ops_test.cc.
+ //
+ // softmax = exp(logits - max(logits along classes));
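+    // Subtracting the per-row max leaves the result unchanged but keeps
+    // exp() from overflowing for large logits.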
+ softmax.device(d) = (logits -
+ logits.maximum(along_class)
+ .eval()
+ .reshape(batch_by_one)
+ .broadcast(one_by_class)).exp();
+ // softmax = softmax / sum(softmax along classes);
+ softmax.device(d) = (softmax /
+ softmax.sum(along_class)
+ .eval()
+ .reshape(batch_by_one)
+ .broadcast(one_by_class));
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SOFTMAX_OP_H_
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
new file mode 100644
index 0000000000..d5aaf9c364
--- /dev/null
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -0,0 +1,31 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/softmax_op.h"
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization for a GPUDevice, that uses the Eigen implementation
+// from SoftmaxEigenImpl.
+namespace functor {
+template <typename T>
+struct SoftmaxFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::Matrix softmax) {
+ SoftmaxEigenImpl<GPUDevice, T>::Compute(d, logits, softmax);
+ }
+};
+} // end namespace functor
+
+// Instantiate the GPU implementation for float.
+template struct functor::SoftmaxFunctor<GPUDevice, float>;
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
new file mode 100644
index 0000000000..b5fb57d3c5
--- /dev/null
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -0,0 +1,97 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/softplus_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SoftplusOp : public UnaryElementWiseOp<T, SoftplusOp<Device, T>> {
+ public:
+ using UnaryElementWiseOp<T, SoftplusOp<Device, T>>::UnaryElementWiseOp;
+
+ void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+ functor::Softplus<Device, T> functor;
+ functor(context->eigen_device<Device>(), input.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+template <typename Device, typename T>
+class SoftplusGradOp
+ : public BinaryElementWiseOp<T, SoftplusGradOp<Device, T>> {
+ public:
+ using BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>::BinaryElementWiseOp;
+
+ // INPUTS:
+ // g (gradients): backpropagated gradients
+ // a (inputs): inputs that were passed to SoftplusOp()
+ // OUTPUT:
+ // gradients to backprop
+ template <int NDIMS>
+ void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+ Tensor* output) {
+ OP_REQUIRES(context, a.IsSameSize(g),
+ errors::InvalidArgument("g and a must be the same size"));
+ functor::SoftplusGrad<Device, T> functor;
+ functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
+ output->flat<T>());
+ }
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Softplus").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SoftplusOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SoftplusGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SoftplusGradOp<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void Softplus<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor activations); \
+ extern template struct Softplus<GPUDevice, T>; \
+ \
+ template <> \
+ void SoftplusGrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \
+ typename TTypes<T>::ConstTensor features, \
+ typename TTypes<T>::Tensor backprops); \
+ extern template struct SoftplusGrad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+} // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Softplus").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ SoftplusOp<GPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SoftplusGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ SoftplusGradOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/softplus_op.h b/tensorflow/core/kernels/softplus_op.h
new file mode 100644
index 0000000000..3545a78246
--- /dev/null
+++ b/tensorflow/core/kernels/softplus_op.h
@@ -0,0 +1,46 @@
+#ifndef TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
+#define TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
+// Functor definition for SoftplusOp and SoftplusGradOp, must be compilable by
+// nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by SoftplusOp to do the computations.
+template <typename Device, typename T>
+struct Softplus {
+ // Computes Softplus activation.
+ //
+ // features: any shape.
+ // activations: same shape as "features".
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor activations) {
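+    // softplus(x) = log(1 + exp(x)); for x > 30 this equals x to within
+    // float precision, so the select avoids overflowing exp().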
+ activations.device(d) =
+ (features > features.constant(30.f))
+ .select(features, (features.exp() + features.constant(1.0f)).log());
+ }
+};
+
+// Functor used by SoftplusGradOp to do the computations.
+template <typename Device, typename T>
+struct SoftplusGrad {
+ // Computes SoftplusGrad backprops.
+ //
+ // gradients: gradients backpropagated to the Softplus op.
+  // features: inputs that were passed to the Softplus op.
+ // backprops: gradients to backpropagate to the Softplus inputs.
+ void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+ typename TTypes<T>::ConstTensor features,
+ typename TTypes<T>::Tensor backprops) {
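+    // d/dx softplus(x) = sigmoid(x) = 1 / (1 + exp(-x)), so each backprop is
+    // the incoming gradient scaled by the sigmoid of the feature.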
+ backprops.device(d) =
+ gradients / ((-features).exp() + features.constant(1.0f));
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
diff --git a/tensorflow/core/kernels/softplus_op_gpu.cu.cc b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
new file mode 100644
index 0000000000..7a974321a7
--- /dev/null
+++ b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
@@ -0,0 +1,25 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/softplus_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Definition of the GPU implementations declared in softplus_op.cc.
+#define DEFINE_GPU_KERNELS(T) \
+ template struct functor::Softplus<GPUDevice, T>; \
+ template struct functor::SoftplusGrad<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/sparse_concat_op.cc b/tensorflow/core/kernels/sparse_concat_op.cc
new file mode 100644
index 0000000000..72c267a47d
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_concat_op.cc
@@ -0,0 +1,139 @@
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SparseConcatOp : public OpKernel {
+ public:
+ explicit SparseConcatOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("concat_dim", &concat_dim_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ OpInputList inds;
+ OP_REQUIRES_OK(context, context->input_list("indices", &inds));
+ const int N = inds.size();
+ for (int i = 0; i < N; i++) {
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(inds[i].shape()),
+ errors::InvalidArgument(
+ "Input indices should be a matrix but received shape ",
+ inds[i].shape().DebugString(), " at position ", i));
+ }
+
+ OpInputList vals;
+ OP_REQUIRES_OK(context, context->input_list("values", &vals));
+ OP_REQUIRES(context, vals.size() == N,
+ errors::InvalidArgument("Expected ", N, " input values, got ",
+ vals.size()));
+ for (int i = 0; i < N; i++) {
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(vals[i].shape()),
+ errors::InvalidArgument(
+ "Input values should be a vector but received shape ",
+ vals[i].shape().DebugString(), " at position ", i));
+ }
+
+ OpInputList shapes;
+ OP_REQUIRES_OK(context, context->input_list("shapes", &shapes));
+ OP_REQUIRES(context, shapes.size() == N,
+ errors::InvalidArgument("Expected ", N, " input shapes, got ",
+ shapes.size()));
+ for (int i = 0; i < N; i++) {
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(shapes[i].shape()),
+ errors::InvalidArgument(
+ "Input shapes should be a vector but received shape ",
+ shapes[i].shape().DebugString(), " at position ", i));
+ }
+
+ const TensorShape input_shape(shapes[0].vec<int64>());
+ OP_REQUIRES(
+ context, concat_dim_ >= 0 && concat_dim_ < input_shape.dims(),
+ errors::InvalidArgument("Concat dimension must be between 0 and rank (",
+ input_shape.dims(), "), got ", concat_dim_));
+ for (int i = 1; i < N; ++i) {
+ const TensorShape current_shape(shapes[i].vec<int64>());
+ OP_REQUIRES(context, current_shape.dims() == input_shape.dims(),
+ errors::InvalidArgument(
+ "Ranks of all input tensors must match: expected ",
+ input_shape.dims(), " but got ", current_shape.dims(),
+ " at position ", i));
+ for (int j = 0; j < input_shape.dims(); ++j) {
+ if (j != concat_dim_) {
+ OP_REQUIRES(
+ context, input_shape.dim_size(j) == current_shape.dim_size(j),
+ errors::InvalidArgument(
+ "Input shapes must match: expected ", input_shape.dim_size(j),
+ " for dimension ", j, " but got ", current_shape.dim_size(j),
+ " at position ", i));
+ }
+ }
+ }
+
+ // The input and output sparse tensors are assumed to be ordered along
+ // increasing dimension number. But in order for concat to work properly,
+ // order[0] must be concat_dim. So we will reorder the inputs to the
+ // concat ordering, concatenate, then reorder back to the standard order.
+ // We make a deep copy of the input tensors to ensure that the in-place
+ // reorder doesn't create race conditions for other ops that may be
+ // concurrently reading the indices and values tensors.
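+    // For example, with rank-3 inputs and concat_dim_ == 1 the concat
+    // ordering is {1, 0, 2}, and the final Reorder restores {0, 1, 2}.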
+
+ gtl::InlinedVector<int64, 8> std_order(input_shape.dims());
+ std::iota(std_order.begin(), std_order.end(), 0);
+
+ std::vector<int64> concat_order;
+ concat_order.reserve(input_shape.dims());
+ concat_order.push_back(concat_dim_);
+ for (int j = 0; j < input_shape.dims(); ++j) {
+ if (j != concat_dim_) {
+ concat_order.push_back(j);
+ }
+ }
+
+ std::vector<sparse::SparseTensor> sp_inputs;
+ for (int i = 0; i < N; ++i) {
+ const TensorShape current_shape(shapes[i].vec<int64>());
+ sp_inputs.emplace_back(tensor::DeepCopy(inds[i]),
+ tensor::DeepCopy(vals[i]), current_shape,
+ std_order);
+ sp_inputs[i].Reorder<T>(concat_order);
+ }
+
+ sparse::SparseTensor concat = sparse::SparseTensor::Concat<T>(sp_inputs);
+ concat.Reorder<T>(std_order);
+
+ context->set_output(0, concat.indices());
+ context->set_output(1, concat.values());
+
+ Tensor* output_shape_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 2, TensorShape({concat.shape().dims()}),
+ &output_shape_out));
+ auto output_shape = output_shape_out->vec<int64>();
+ for (int j = 0; j < concat.shape().dims(); ++j) {
+ output_shape(j) = concat.shape().dim_size(j);
+ }
+ }
+
+ private:
+ int concat_dim_;
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseConcat").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseConcatOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
new file mode 100644
index 0000000000..919e129ff8
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -0,0 +1,192 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/port.h"
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename T>
+void PrefetchBlockNTA(const T& tensor, int si, int ei, int sj, int ej) {
+ for (int i = si; i < ei; ++i) {
+ for (int j = sj; j < ej; j = j + 16) {
+ port::prefetch<port::PREFETCH_HINT_NTA>(&tensor(i, j));
+ }
+ }
+}
+
+template <typename T>
+void PrefetchBlockT1(const T& tensor, int si, int ei, int sj, int ej) {
+ for (int i = si; i < ei; ++i) {
+ for (int j = sj; j < ej; j = j + 16) {
+ port::prefetch<port::PREFETCH_HINT_T1>(&tensor(i, j));
+ }
+ }
+}
+
+struct Block {
+ Block(int sm, int em, int sk, int ek, int sn, int en)
+ : startm(sm), endm(em), startk(sk), endk(ek), startn(sn), endn(en) {}
+
+ int startm;
+ int endm;
+ int startk;
+ int endk;
+ int startn;
+ int endn;
+};
+
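+// Advances *next to the block after b, sweeping the k range fastest, then m
+// within [m_start, m), then n. Returns true when there is no further block,
+// i.e. the n range has been exhausted.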
+bool NextBlock(const int Bm, const int Bk, const int Bn, const int m_start,
+ const int m, const int k, const int n, const Block& b,
+ Block* next) {
+ *next = b;
+ if (b.endk < k) {
+ next->startk = b.endk;
+ next->endk = std::min(b.endk + Bk, k);
+ } else {
+ next->startk = 0;
+ next->endk = std::min(Bk, k);
+ if (b.endm < m) {
+ next->startm = b.endm;
+ next->endm = std::min(b.endm + Bm, m);
+ } else {
+ next->startm = m_start;
+ next->endm = std::min(m_start + Bm, m);
+ next->startn = b.endn;
+ next->endn = std::min(b.endn + Bn, n);
+ }
+ }
+ return next->startn == next->endn;
+}
+
+class SparseMatMulOp : public OpKernel {
+ public:
+ explicit SparseMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("a_is_sparse", &a_is_sparse_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("b_is_sparse", &b_is_sparse_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& a = ctx->input(0);
+ const Tensor& b = ctx->input(1);
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
+ errors::InvalidArgument("a is not a matrix"));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
+ errors::InvalidArgument("b is not a matrix"));
+
+ auto left = a.matrix<float>();
+ auto right_mat = b.matrix<float>();
+ const int m = transpose_a_ ? left.dimension(1) : left.dimension(0);
+ const int k = transpose_a_ ? left.dimension(0) : left.dimension(1);
+ const int n =
+ transpose_b_ ? right_mat.dimension(0) : right_mat.dimension(1);
+ const int k2 =
+ transpose_b_ ? right_mat.dimension(1) : right_mat.dimension(0);
+
+ OP_REQUIRES(ctx, k == k2,
+ errors::InvalidArgument("Matrix size incompatible: a: ",
+ a.shape().DebugString(), ", b: ",
+ b.shape().DebugString()));
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output));
+ auto out = output->matrix<float>();
+
+ if (!a_is_sparse_) {
+      // Fall back to Eigen's tensor contraction.
+      // Note that we currently don't optimize the case where only the right
+      // matrix is sparse. That can generally be handled by transposing the
+      // order of the matmul.
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0].first = transpose_a_ ? 0 : 1;
+ dim_pair[0].second = transpose_b_ ? 1 : 0;
+ out.device(ctx->template eigen_device<CPUDevice>()) =
+ left.contract(right_mat, dim_pair);
+ return;
+ }
+ typedef Eigen::Tensor<float, 2, Eigen::RowMajor> Matrix;
+ std::unique_ptr<Matrix> right_tr_mat;
+ std::unique_ptr<TTypes<float>::ConstMatrix> right_tr_map;
+ if (transpose_b_) {
+ right_tr_mat.reset(new Matrix(k, n));
+ Eigen::array<int, 2> perm({1, 0});
+ right_tr_mat->device(ctx->template eigen_device<CPUDevice>()) =
+ right_mat.shuffle(perm);
+ right_tr_map.reset(new TTypes<float>::ConstMatrix(
+ right_tr_mat->data(), right_tr_mat->dimensions()));
+ }
+ TTypes<float>::ConstMatrix& right =
+ transpose_b_ ? *right_tr_map : right_mat;
+
+ const bool transpose_a = transpose_a_;
+
+ typedef Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> TensorMap;
+ typedef Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>,
+ Eigen::Unaligned> ConstTensorMap;
+ typedef Eigen::DSizes<Eigen::DenseIndex, 1> DSizes;
+ const int Bm = 16;
+ const int Bk = 16;
+ const int Bn = 1024;
+
+ auto work_shard = [m, n, k, transpose_a, Bm, Bk, Bn, &left, &right, &out](
+ int64 start64, int64 end64) {
+ const int start = static_cast<int>(start64);
+ const int end = static_cast<int>(end64);
+ Block curr(start, std::min(start + Bm, end), 0, std::min(Bk, k), 0,
+ std::min(Bn, n));
+ Block next(curr);
+ bool done = false;
+ for (int i = start; i < end; ++i) {
+ out.chip<0>(i).setZero();
+ }
+ while (true) {
+ done = NextBlock(Bm, Bk, Bn, start, end, k, n, curr, &next);
+
+ PrefetchBlockT1(right, curr.startk, curr.endk, curr.startn, curr.endn);
+
+ // Process current block
+ for (int i = curr.startm; i < curr.endm; ++i) {
+ PrefetchBlockNTA(left, i, i + 1, curr.startk, curr.endk);
+ PrefetchBlockNTA(out, i, i + 1, curr.startn, curr.endn);
+ DSizes out_slice_shape(curr.endn - curr.startn);
+ TensorMap out_i(&out(i, curr.startn), out_slice_shape);
+ for (int j = curr.startk; j < curr.endk; ++j) {
+ const float l = transpose_a ? left(j, i) : left(i, j);
+ if (l == 0) continue;
+ ConstTensorMap right_j(&right(j, curr.startn), out_slice_shape);
+ out_i += right_j * l;
+ }
+ }
+ if (done) break;
+ curr = next;
+ }
+ };
+ auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
+ Shard(worker_threads.num_threads, worker_threads.workers, m, 2 * k * n,
+ work_shard);
+ }
+
+ private:
+ bool transpose_a_;
+ bool transpose_b_;
+ bool a_is_sparse_;
+ bool b_is_sparse_;
+ TF_DISALLOW_COPY_AND_ASSIGN(SparseMatMulOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("SparseMatMul").Device(DEVICE_CPU),
+ SparseMatMulOp);
+
+} // end namespace tensorflow
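
The Block/NextBlock machinery above is an explicit iterator over a 3-D tiling of the (m, k, n) iteration space: it advances the reduction dimension k first, then the row blocks assigned to the shard, then the output-column blocks. Written as plain loops (the free-function form and names here are illustrative, not part of the kernel), the same traversal order is:

#include <algorithm>

// Illustrative only: the traversal order NextBlock() produces, as plain loops.
// [start, end) are the rows assigned to one shard; Bm/Bk/Bn are the block
// sizes hard-coded in the kernel above.
void WalkBlocks(int start, int end, int k, int n, int Bm, int Bk, int Bn,
                void (*ProcessBlock)(int, int, int, int, int, int)) {
  for (int sn = 0; sn < n; sn += Bn)          // output columns, outermost
    for (int sm = start; sm < end; sm += Bm)  // rows of this shard
      for (int sk = 0; sk < k; sk += Bk)      // reduction dim, innermost
        ProcessBlock(sm, std::min(sm + Bm, end), sk, std::min(sk + Bk, k),
                     sn, std::min(sn + Bn, n));
}

Keeping k innermost lets a Bk x Bn tile of `right` stay warm in cache while rows of `left` stream through it, which is what the prefetch calls above are arranged around.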
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
new file mode 100644
index 0000000000..883d0d1224
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -0,0 +1,139 @@
+#include "tensorflow/core/framework/types.pb.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+random::PhiloxRandom philox(1, 1);
+random::SimplePhilox rnd(&philox);
+
+void Sparsify(Tensor* t, float sparsity) {
+ const int64 N = t->NumElements();
+ CHECK_LE(sparsity, 1);
+ if (sparsity <= 0) return;
+ auto flat = t->flat<float>();
+ static const uint32 K = 10000;
+ for (int64 i = 0; i < N; ++i) {
+ if (rnd.Uniform(K) < sparsity * K) {
+ flat(i) = 0;
+ }
+ }
+}
+
+Node* SparseMatMulNode(Graph* g, Node* in0, Node* in1, bool transpose_a,
+ bool transpose_b, bool a_sparse, bool b_sparse) {
+ Node* ret;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseMatMul")
+ .Input(in0)
+ .Input(in1)
+ .Attr("transpose_a", transpose_a)
+ .Attr("transpose_b", transpose_b)
+ .Attr("a_is_sparse", a_sparse)
+ .Attr("b_is_sparse", b_sparse)
+ .Finalize(g, &ret));
+ return ret;
+}
+
+static Graph* SparseMatMulHelper(Graph* g, int m, int n, int d, float sparsity,
+ bool transpose_a, bool transpose_b,
+ bool a_sparse, bool b_sparse) {
+ a_sparse = a_sparse && (sparsity > 0);
+ b_sparse = b_sparse && (sparsity > 0);
+
+ auto left_shape = transpose_a ? TensorShape({d, m}) : TensorShape({m, d});
+ Tensor left(DataTypeToEnum<float>::value, left_shape);
+ left.flat<float>().setRandom();
+ if (a_sparse) {
+ Sparsify(&left, sparsity);
+ }
+
+ auto right_shape = transpose_b ? TensorShape({n, d}) : TensorShape({d, n});
+ Tensor right(DataTypeToEnum<float>::value, right_shape);
+ right.flat<float>().setRandom();
+ if (b_sparse) {
+ Sparsify(&right, sparsity);
+ }
+
+ SparseMatMulNode(g, test::graph::Constant(g, left),
+ test::graph::Constant(g, right), transpose_a, transpose_b,
+ a_sparse, b_sparse);
+ return g;
+}
+
+static Graph* SparseMatMul(int m, int n, int d, float sparsity,
+ bool transpose_a, bool transpose_b) {
+ Graph* g = new Graph(OpRegistry::Global());
+ return SparseMatMulHelper(g, m, n, d, sparsity, transpose_a, transpose_b,
+ true, false);
+}
+
+static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_a,
+ float sparsity_b) {
+ Graph* g = new Graph(OpRegistry::Global());
+ if (sparsity_a == 0 && sparsity_b > 0) {
+ SparseMatMulHelper(g, m, n, d, sparsity_a, false, false, false, false);
+ SparseMatMulHelper(g, n, d, m, sparsity_b, true, true, true, false);
+ SparseMatMulHelper(g, m, d, n, sparsity_b, false, false, true, false);
+ } else {
+ SparseMatMulHelper(g, m, n, d, sparsity_a, false, true, true, false);
+ SparseMatMulHelper(g, d, n, m, sparsity_a, true, false, true, true);
+ SparseMatMulHelper(g, m, d, n, sparsity_b, false, false, true, false);
+ }
+ return g;
+}
+
+#define BM_SPARSE(M, K, N, S) \
+ static void BM_Sparse##_##M##_##K##_##N##_##S(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
+ std::string label = strings::Printf("%d_%d_%d_%0.2f", M, K, N, S / 100.0); \
+ testing::SetLabel(label); \
+ test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, false, false)) \
+ .Run(iters); \
+ } \
+ BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S);
+
+BM_SPARSE(2048, 2048, 2048, 0);
+BM_SPARSE(2048, 2048, 2048, 1);
+BM_SPARSE(2048, 2048, 2048, 85);
+
+BM_SPARSE(1024, 1024, 1024, 0);
+BM_SPARSE(1024, 1024, 1024, 1);
+BM_SPARSE(1024, 1024, 1024, 85);
+
+BM_SPARSE(256, 256, 256, 1);
+BM_SPARSE(512, 512, 512, 1);
+
+#define BM_SPARSE_MULTI(M, K, N, S1, S2) \
+ static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 3); \
+ std::string label = strings::Printf("%d_%d_%d_%0.2f_%0.2f", M, K, N, \
+ S1 / 100.0, S2 / 100.0); \
+ testing::SetLabel(label); \
+ test::Benchmark("cpu", MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0)) \
+ .Run(iters); \
+ } \
+ BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2);
+
+BM_SPARSE_MULTI(512, 2140, 4096, 0, 82);
+BM_SPARSE_MULTI(512, 4096, 2048, 83, 83);
+
+#define BM_SPARSE_TR(M, K, N, S, TA, TB) \
+ static void BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
+ std::string label = \
+ strings::Printf("%d_%d_%d_%d_%d_%0.2f", M, K, N, TA, TB, S / 100.0); \
+ testing::SetLabel(label); \
+ test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, TA, TB)) \
+ .Run(iters); \
+ } \
+ BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB);
+
+BM_SPARSE_TR(2048, 2048, 2048, 1, true, false);
+BM_SPARSE_TR(2048, 2048, 2048, 1, false, true);
+BM_SPARSE_TR(2048, 2048, 2048, 1, true, true);
+
+} // end namespace tensorflow
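
Sparsify() above zeroes roughly a `sparsity` fraction of entries using TF's SimplePhilox; a self-contained standard-library equivalent (illustrative only, and a different random stream) would be:

#include <cstdint>
#include <random>
#include <vector>

// Zero out roughly a `sparsity` fraction (in [0, 1]) of the entries.
void SparsifyStd(std::vector<float>* data, float sparsity,
                 uint32_t seed = 1) {
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  for (float& v : *data) {
    if (uniform(rng) < sparsity) v = 0.0f;
  }
}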
diff --git a/tensorflow/core/kernels/sparse_reorder_op.cc b/tensorflow/core/kernels/sparse_reorder_op.cc
new file mode 100644
index 0000000000..fd6824a4e2
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_reorder_op.cc
@@ -0,0 +1,71 @@
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SparseReorderOp : public OpKernel {
+ public:
+ explicit SparseReorderOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_ind = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_ind.shape()),
+ errors::InvalidArgument(
+ "Input indices should be a matrix but received shape",
+ input_ind.shape().DebugString()));
+
+ const Tensor& input_val = context->input(1);
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input_val.shape()),
+ errors::InvalidArgument(
+ "Input values should be a vector but received shape",
+ input_val.shape().DebugString()));
+
+ const Tensor& input_shape_in = context->input(2);
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()),
+ errors::InvalidArgument(
+ "Input shape should be a vector but received shape",
+ input_shape_in.shape().DebugString()));
+
+ const TensorShape input_shape(input_shape_in.vec<int64>());
+
+ gtl::InlinedVector<int64, 8> std_order(input_shape.dims());
+ std::iota(std_order.begin(), std_order.end(), 0);
+
+ // Check if the sparse tensor is already ordered correctly
+ sparse::SparseTensor input_sp(input_ind, input_val, input_shape, std_order);
+
+ if (input_sp.IndicesValid()) {
+ context->set_output(0, input_sp.indices());
+ context->set_output(1, input_sp.values());
+ } else {
+ // Deep-copy the input Tensors, then reorder in-place
+ sparse::SparseTensor reordered_sp(tensor::DeepCopy(input_ind),
+ tensor::DeepCopy(input_val),
+ input_shape);
+ reordered_sp.Reorder<T>(std_order);
+ context->set_output(0, reordered_sp.indices());
+ context->set_output(1, reordered_sp.values());
+ }
+ }
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("SparseReorder").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ SparseReorderOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+} // namespace tensorflow
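
Reorder<T>() puts the (index row, value) pairs into row-major (lexicographic) order. A standalone sketch of that ordering without SparseTensor, using flat vectors and illustrative names:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

// indices: num_elems x num_dims, row-major; values: num_elems.
void LexicographicReorder(std::vector<int64_t>* indices,
                          std::vector<float>* values, int num_dims) {
  const int num_elems = static_cast<int>(values->size());
  std::vector<int> perm(num_elems);
  std::iota(perm.begin(), perm.end(), 0);
  std::sort(perm.begin(), perm.end(), [&](int a, int b) {
    for (int d = 0; d < num_dims; ++d) {
      const int64_t ia = (*indices)[a * num_dims + d];
      const int64_t ib = (*indices)[b * num_dims + d];
      if (ia != ib) return ia < ib;
    }
    return false;  // rows compare equal
  });
  // Apply the permutation to both indices and values.
  std::vector<int64_t> new_indices(indices->size());
  std::vector<float> new_values(num_elems);
  for (int i = 0; i < num_elems; ++i) {
    new_values[i] = (*values)[perm[i]];
    for (int d = 0; d < num_dims; ++d) {
      new_indices[i * num_dims + d] = (*indices)[perm[i] * num_dims + d];
    }
  }
  *indices = std::move(new_indices);
  *values = std::move(new_values);
}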
diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc
new file mode 100644
index 0000000000..47e91c134d
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_to_dense_op.cc
@@ -0,0 +1,129 @@
+// See core/ops/sparse_ops.cc for documentation.
+//
+// NOTE: the operations in this file are only suitable for execution
+// on CPUs.
+
+#define EIGEN_USE_THREADS
+
+#include <string>
+#include <sstream>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+// Operator to convert sparse representations to dense.
+template <typename T, typename Index>
+class SparseToDense : public OpKernel {
+ public:
+ explicit SparseToDense(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ // sparse_indices
+ const Tensor& indices = c->input(0);
+ OP_REQUIRES(c, indices.dims() <= 2,
+ errors::InvalidArgument(
+ "sparse_indices should be a scalar, vector, or matrix, "
+ "got shape ",
+ indices.shape().ShortDebugString()));
+ const int64 num_elems = indices.dims() > 0 ? indices.dim_size(0) : 1;
+ const int64 num_dims = indices.dims() > 1 ? indices.dim_size(1) : 1;
+
+ // output_shape
+ const Tensor& output_shape = c->input(1);
+ OP_REQUIRES(
+ c, TensorShapeUtils::IsLegacyVector(output_shape.shape()),
+ errors::InvalidArgument("output_shape should be a vector, got shape ",
+ output_shape.shape().ShortDebugString()));
+ OP_REQUIRES(c, output_shape.NumElements() == num_dims,
+ errors::InvalidArgument(
+ "output_shape has incorrect number of elements: ",
+ output_shape.NumElements(), " should be: ", num_dims));
+
+ // sparse_values
+ const Tensor& sparse_values = c->input(2);
+ const int64 num_values = sparse_values.NumElements();
+ OP_REQUIRES(
+ c, sparse_values.dims() == 0 ||
+ (sparse_values.dims() == 1 && num_values == num_elems),
+ errors::InvalidArgument("sparse_values has incorrect shape ",
+ sparse_values.shape().ShortDebugString(),
+ ", should be [] or [", num_elems, "]"));
+
+ // default_value
+ const Tensor& default_value = c->input(3);
+ OP_REQUIRES(c, TensorShapeUtils::IsScalar(default_value.shape()),
+ errors::InvalidArgument("default_value should be a scalar."));
+
+ auto output_shape_vec = output_shape.flat<Index>();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShapeUtils::MakeShape(
+ output_shape_vec.data(),
+ output_shape_vec.size()),
+ &output));
+
+ TensorShape ix_shape({num_elems, num_dims});
+ Tensor indices_shaped(DT_INT64, ix_shape);
+ if (indices.dtype() == DT_INT64) {
+ CHECK(indices_shaped.CopyFrom(indices, ix_shape));
+ } else {
+ indices_shaped.matrix<int64>() =
+ indices.shaped<Index, 2>(ix_shape.dim_sizes()).template cast<int64>();
+ }
+
+ // If we received a scalar, we'll need to create a new
+ // tensor with copies of the values as a vec.
+ // TODO(ebrevdo): find a way to avoid this temp allocation.
+ Tensor sparse_values_b;
+
+ if (TensorShapeUtils::IsScalar(sparse_values.shape())) {
+ OP_REQUIRES_OK(
+ c, c->allocate_temp(DataTypeToEnum<T>::value,
+ TensorShape({num_elems}), &sparse_values_b));
+ sparse_values_b.vec<T>().setConstant(sparse_values.scalar<T>()());
+ } else {
+ sparse_values_b = sparse_values;
+ }
+
+ gtl::InlinedVector<int64, 8> order(output->shape().dims());
+ std::iota(order.begin(), order.end(), 0); // Assume order is correct
+ sparse::SparseTensor st(indices_shaped, sparse_values_b, output->shape(),
+ order);
+
+ output->flat<T>().setConstant(default_value.scalar<T>()());
+ OP_REQUIRES(c, st.template ToDense<T>(output, false /* initialize */),
+ errors::InvalidArgument(
+ "Indices are not valid (out of bounds). Shape: ",
+ output->shape().DebugString()));
+ }
+};
+
+#define REGISTER_KERNELS(type, index_type) \
+ REGISTER_KERNEL_BUILDER(Name("SparseToDense") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<index_type>("Tindices"), \
+ SparseToDense<type, index_type>);
+
+#define REGISTER_KERNELS_ALL(type) \
+ REGISTER_KERNELS(type, int32); \
+ REGISTER_KERNELS(type, int64);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS_ALL);
+REGISTER_KERNELS_ALL(bool);
+REGISTER_KERNELS_ALL(string);
+
+#undef REGISTER_KERNELS_ALL
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
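
Conceptually the kernel fills the output with default_value and then scatters each (index, value) pair at its row-major offset. A minimal sketch under the same assumptions (in-bounds indices, values already broadcast from a scalar if needed; names and plain-vector types are illustrative):

#include <cstdint>
#include <vector>

// indices: num_elems x ndims (row-major); shape: ndims entries;
// values: num_elems entries.
std::vector<float> ScatterToDense(const std::vector<int64_t>& indices,
                                  const std::vector<float>& values,
                                  const std::vector<int64_t>& shape,
                                  float default_value) {
  const int ndims = static_cast<int>(shape.size());
  int64_t total = 1;
  for (int64_t d : shape) total *= d;
  std::vector<float> dense(total, default_value);
  for (size_t e = 0; e < values.size(); ++e) {
    int64_t flat = 0;
    for (int d = 0; d < ndims; ++d) {
      flat = flat * shape[d] + indices[e * ndims + d];  // row-major offset
    }
    dense[flat] = values[e];
  }
  return dense;
}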
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
new file mode 100644
index 0000000000..e9800ccd68
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -0,0 +1,283 @@
+#include <functional>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+
+namespace {
+
+class SparseToDenseTest : public OpsTestBase {
+ protected:
+ void SetUp() override { RequireDefaultOps(); }
+
+ void MakeOp(int dim, DataType index_type, DataType value_type) {
+ ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(value_type))
+ .Input(FakeInput(value_type))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SparseToDenseTest, OneD_OneValue) {
+ MakeOp(1, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3}), {1, 3, 4});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({1}), {5});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {5});
+ test::FillValues<float>(&expected, {-2, 2, -2, 2, 2});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, OneD_OneValue_int64_double) {
+ MakeOp(1, DT_INT64, DT_DOUBLE);
+
+ // sparse_indices
+ AddInputFromArray<int64>(TensorShape({3}), {1, 3, 4});
+ // output_shape
+ AddInputFromArray<int64>(TensorShape({1}), {5});
+ // sparse_values
+ AddInputFromArray<double>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<double>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_DOUBLE, {5});
+ test::FillValues<double>(&expected, {-2, 2, -2, 2, 2});
+ test::ExpectTensorEqual<double>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, OneD_MultValues) {
+ MakeOp(1, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>({3}, {1, 3, 4});
+ // output_shape
+ AddInputFromArray<int32>({1}, {5});
+ // sparse_values
+ AddInputFromArray<float>({3}, {3, 4, 5});
+ // default_value
+ AddInputFromArray<float>({}, {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {5});
+ test::FillValues<float>(&expected, {-2, 3, -2, 4, 5});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, TwoD_OneValue) {
+ MakeOp(2, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 2}), {0, 1, 0, 2, 2, 3});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({2}), {3, 4});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 2>()(0, 1) = 2;
+ expected.tensor<float, 2>()(0, 2) = 2;
+ expected.tensor<float, 2>()(2, 3) = 2;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, TwoD_MultValues) {
+ MakeOp(2, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 2}), {0, 1, 0, 2, 2, 3});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({2}), {3, 4});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({3}), {3, 4, 5});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 2>()(0, 1) = 3;
+ expected.tensor<float, 2>()(0, 2) = 4;
+ expected.tensor<float, 2>()(2, 3) = 5;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, ThreeD_OneValue) {
+ MakeOp(3, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 3}), {0, 1, 1, 0, 2, 0, 2, 3, 1});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({3}), {3, 4, 2});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({}), {2});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4, 2});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 3>()(0, 1, 1) = 2;
+ expected.tensor<float, 3>()(0, 2, 0) = 2;
+ expected.tensor<float, 3>()(2, 3, 1) = 2;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(SparseToDenseTest, ThreeD_MultValues) {
+ MakeOp(3, DT_INT32, DT_FLOAT);
+
+ // sparse_indices
+ AddInputFromArray<int32>(TensorShape({3, 3}), {0, 1, 1, 0, 2, 0, 2, 3, 1});
+ // output_shape
+ AddInputFromArray<int32>(TensorShape({3}), {3, 4, 2});
+ // sparse_values
+ AddInputFromArray<float>(TensorShape({3}), {3, 4, 5});
+ // default_value
+ AddInputFromArray<float>(TensorShape({}), {-2});
+
+ ASSERT_OK(RunOpKernel());
+
+ Tensor expected(allocator(), DT_FLOAT, {3, 4, 2});
+ expected.flat<float>().setConstant(-2);
+ expected.tensor<float, 3>()(0, 1, 1) = 3;
+ expected.tensor<float, 3>()(0, 2, 0) = 4;
+ expected.tensor<float, 3>()(2, 3, 1) = 5;
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+} // namespace
+
+static int BM_Arg(int ndim, int n) { return (ndim * 1000000) + n; }
+static int NDIM_from_arg(int bm_arg) { return bm_arg / 1000000; }
+static int N_from_arg(int bm_arg) { return bm_arg % 1000000; }
+
+static void BM_SparseToDense(int iters, const int bm_arg) {
+ const int NDIM = NDIM_from_arg(bm_arg);
+ const int N = N_from_arg(bm_arg);
+ // TODO(zhifengc): Switch to use kernel_benchmark_testlib.h
+ tensorflow::testing::StopTiming();
+
+ const int IndexDim = (NDIM == 1) ? 0 : 1;
+
+ std::unique_ptr<Device> device(
+ DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+
+ gtl::InlinedVector<TensorValue, 4> inputs;
+
+  // Build an output shape with N along the index dimension and 3 elsewhere.
+ Tensor output_shape(DT_INT32, TensorShape({NDIM}));
+ Tensor sparse_indices(DT_INT32, TensorShape({N, NDIM}));
+ Tensor sparse_values(DT_FLOAT, TensorShape({N}));
+ Tensor default_value(DT_FLOAT, TensorShape({}));
+ auto output_shape_t = output_shape.vec<int32>();
+ for (int d = 0; d < NDIM; ++d) {
+ output_shape_t(d) = (d == IndexDim) ? N : 3;
+ }
+
+ auto sparse_indices_t = sparse_indices.matrix<int32>();
+ for (int n = 0; n < N; ++n) {
+ for (int d = 0; d < NDIM; ++d)
+ sparse_indices_t(n, d) = (d == IndexDim) ? n : 0;
+ }
+
+ for (auto* ptr :
+ {&sparse_indices, &output_shape, &sparse_values, &default_value}) {
+ inputs.push_back({nullptr, ptr});
+ }
+
+ NodeDef sparse_node_def;
+ TF_CHECK_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
+ .Input(FakeInput(DT_INT32))
+ .Input(FakeInput(DT_INT32))
+ .Input(FakeInput(DT_FLOAT))
+ .Input(FakeInput(DT_FLOAT))
+ .Finalize(&sparse_node_def));
+
+ Status status;
+ std::unique_ptr<OpKernel> op(CreateOpKernel(
+ DEVICE_CPU, device.get(), cpu_allocator(), sparse_node_def, &status));
+
+ OpKernelContext::Params params;
+ params.device = device.get();
+ params.frame_iter = FrameAndIter(0, 0);
+ params.inputs = &inputs;
+ params.op_kernel = op.get();
+ params.output_alloc_attr = [&device, &op, &params](int index) {
+ AllocatorAttributes attr;
+ const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ return attr;
+ };
+
+ std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(params));
+ op->Compute(sparse_context.get());
+ tensorflow::testing::StartTiming();
+ for (int i = 0; i < iters; ++i) {
+ delete sparse_context->release_output(0).tensor;
+ op->Compute(sparse_context.get());
+ ASSERT_OK(sparse_context->status());
+ }
+ tensorflow::testing::StopTiming();
+
+  // Bytes processed per iteration: mainly reading the sparse input.
+ int64 bytes_per_iter = static_cast<int64>((N + N * NDIM) * sizeof(float));
+
+ tensorflow::testing::BytesProcessed(bytes_per_iter * iters);
+}
+
+BENCHMARK(BM_SparseToDense)
+ ->Arg(BM_Arg(1, 10))
+ ->Arg(BM_Arg(1, 100))
+ ->Arg(BM_Arg(1, 1000))
+ ->Arg(BM_Arg(1, 10000))
+ ->Arg(BM_Arg(2, 10))
+ ->Arg(BM_Arg(2, 100))
+ ->Arg(BM_Arg(2, 1000))
+ ->Arg(BM_Arg(2, 10000))
+ ->Arg(BM_Arg(3, 10))
+ ->Arg(BM_Arg(3, 100))
+ ->Arg(BM_Arg(3, 1000))
+ ->Arg(BM_Arg(3, 10000))
+ ->Arg(BM_Arg(5, 10))
+ ->Arg(BM_Arg(5, 100))
+ ->Arg(BM_Arg(5, 1000))
+ ->Arg(BM_Arg(5, 10000));
+
+} // namespace tensorflow
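
The benchmark packs (NDIM, N) into a single Arg as NDIM * 1000000 + N; a tiny round-trip check of that encoding (illustrative):

#include <cassert>

// Mirrors the encoding above: arg = NDIM * 1000000 + N.
int main() {
  const int arg = 3 * 1000000 + 1000;  // BM_Arg(3, 1000)
  assert(arg / 1000000 == 3);          // NDIM_from_arg
  assert(arg % 1000000 == 1000);       // N_from_arg
  return 0;
}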
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
new file mode 100644
index 0000000000..f4f9ada000
--- /dev/null
+++ b/tensorflow/core/kernels/split_op.cc
@@ -0,0 +1,146 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/split_op.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SplitOp : public OpKernel {
+ public:
+ explicit SplitOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* context) override {
+ const int32 split_dim = context->input(0).flat<int32>()(0);
+ const int32 num_split = num_outputs();
+ const Tensor& input = context->input(1);
+ const TensorShape& input_shape = input.shape();
+
+ OP_REQUIRES(
+ context, 0 <= split_dim && split_dim < input_shape.dims(),
+ errors::InvalidArgument("0 <= split_dim < number of input dimensions (",
+ input_shape.dims(), "), but got ", split_dim));
+
+ OP_REQUIRES(
+ context, num_split > 0,
+ errors::InvalidArgument(
+ "Number of ways to split should be > 0, but got ", num_split));
+
+ OP_REQUIRES(context, input_shape.dim_size(split_dim) % num_split == 0,
+ errors::InvalidArgument(
+ "Number of ways to split should evenly divide the split "
+ "dimension, but got split_dim ",
+ split_dim, " (size = ", input_shape.dim_size(split_dim),
+ ") ", "and num_split ", num_split));
+
+ // Special case 1: num_split == 1. Nothing to do.
+ if (num_split == 1) {
+ VLOG(1) << "Split identity";
+ context->set_output(0, context->input(1));
+ return;
+ }
+
+ // Special case 2: split along the 1st dimension. We can share the
+ // underlying buffer.
+ //
+    // Apply this optimization conservatively: if the input is aligned,
+    // the resulting tensors must be aligned. It's conservative
+    // because if the immediate consumers of the resulting tensors are
+    // not using Eigen for computation, it's perfectly fine to avoid
+    // the copying.
+ if ((split_dim == 0) && IsInnerDimsSizeAligned<T>(input_shape)) {
+ VLOG(1) << "Slice dim 0: " << input_shape.DebugString();
+ const int64 delta = input_shape.dim_size(0) / num_split;
+ for (int i = 0; i < num_split; ++i) {
+ context->set_output(i, input.Slice(i * delta, (i + 1) * delta));
+ }
+ return;
+ }
+
+ int32 prefix_dim_size = 1;
+ for (int i = 0; i < split_dim; ++i) {
+ prefix_dim_size *= input_shape.dim_size(i);
+ }
+
+ int32 split_dim_size = input_shape.dim_size(split_dim);
+
+ int32 suffix_dim_size = 1;
+ for (int i = split_dim + 1; i < input_shape.dims(); ++i) {
+ suffix_dim_size *= input_shape.dim_size(i);
+ }
+
+ auto input_reshaped =
+ input.shaped<T, 3>({prefix_dim_size, split_dim_size, suffix_dim_size});
+
+ const int32 split_dim_output_size = split_dim_size / num_split;
+ TensorShape output_shape(input_shape);
+ output_shape.set_dim(split_dim, split_dim_output_size);
+
+ Eigen::DSizes<ptrdiff_t, 3> indices{0, 0, 0};
+ Eigen::DSizes<ptrdiff_t, 3> sizes{prefix_dim_size, split_dim_output_size,
+ suffix_dim_size};
+
+ for (int i = 0; i < num_split; ++i) {
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(i, output_shape, &result));
+ if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
+ Eigen::DSizes<ptrdiff_t, 3> slice_indices;
+ Eigen::DSizes<ptrdiff_t, 3> slice_sizes;
+ for (int j = 0; j < 3; ++j) {
+ slice_indices[j] = indices[j];
+ slice_sizes[j] = sizes[j];
+ }
+
+ auto result_shaped = result->shaped<T, 3>(
+ {prefix_dim_size, split_dim_output_size, suffix_dim_size});
+
+ functor::Split<Device, T>()(context->eigen_device<Device>(),
+ result_shaped, input_reshaped,
+ slice_indices, slice_sizes);
+ }
+ indices[1] += split_dim_output_size;
+ }
+ }
+};
+
+#define REGISTER_SPLIT(type) \
+ REGISTER_KERNEL_BUILDER(Name("Split") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("split_dim"), \
+ SplitOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_SPLIT);
+
+#undef REGISTER_SPLIT
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("Split") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("split_dim"), \
+ SplitOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
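
Outside the two special cases, the kernel reshapes the input to 3-D as (prefix, split_dim, suffix) and slices along the middle dimension. A small sketch of how those sizes are derived, with an illustrative helper name and a worked example in the comment:

#include <cstdint>
#include <vector>

// For input shape {2, 3, 8, 5}, split_dim = 2, num_split = 4:
//   prefix = 2*3 = 6, per-output split size = 8/4 = 2, suffix = 5,
// so each output is a {6, 2, 5} slice of the reshaped {6, 8, 5} input.
void SplitSizes(const std::vector<int64_t>& shape, int split_dim,
                int num_split, int64_t* prefix, int64_t* split_size,
                int64_t* suffix) {
  *prefix = 1;
  for (int i = 0; i < split_dim; ++i) *prefix *= shape[i];
  *split_size = shape[split_dim] / num_split;  // op requires even divisibility
  *suffix = 1;
  for (size_t i = static_cast<size_t>(split_dim) + 1; i < shape.size(); ++i) {
    *suffix *= shape[i];
  }
}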
diff --git a/tensorflow/core/kernels/split_op.h b/tensorflow/core/kernels/split_op.h
new file mode 100644
index 0000000000..2572c77285
--- /dev/null
+++ b/tensorflow/core/kernels/split_op.h
@@ -0,0 +1,31 @@
+#ifndef TENSORFLOW_KERNELS_SPLIT_OP_H_
+#define TENSORFLOW_KERNELS_SPLIT_OP_H_
+// Functor definition for SplitOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct Split {
+ void operator()(const Device& d, typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes);
+};
+
+template <typename T>
+struct Split<Eigen::ThreadPoolDevice, T> {
+ void operator()(const Eigen::ThreadPoolDevice& d,
+ typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_SPLIT_OP_H_
diff --git a/tensorflow/core/kernels/split_op_cpu.cc b/tensorflow/core/kernels/split_op_cpu.cc
new file mode 100644
index 0000000000..b86deeb8fb
--- /dev/null
+++ b/tensorflow/core/kernels/split_op_cpu.cc
@@ -0,0 +1,30 @@
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/split_op.h"
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename T>
+void Split<Eigen::ThreadPoolDevice, T>::operator()(
+ const Eigen::ThreadPoolDevice& d, typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes) {
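+  // Small outputs are evaluated inline on the calling thread; larger ones go
+  // through the threadpool device so the copy can be parallelized.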
+ if (output.size() < 131072) {
+ output = input.slice(slice_indices, slice_sizes);
+ } else {
+ output.device(d) = input.slice(slice_indices, slice_sizes);
+ }
+}
+
+#define DEFINE_CPU_KERNELS(T) template struct Split<Eigen::ThreadPoolDevice, T>;
+
+TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS)
+
+} // namespace functor
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_op_gpu.cu.cc b/tensorflow/core/kernels/split_op_gpu.cu.cc
new file mode 100644
index 0000000000..f8931d6a89
--- /dev/null
+++ b/tensorflow/core/kernels/split_op_gpu.cu.cc
@@ -0,0 +1,31 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+
+#include "tensorflow/core/kernels/split_op.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+void Split<Device, T>::operator()(
+ const Device& d, typename TTypes<T, 3>::Tensor output,
+ typename TTypes<T, 3>::ConstTensor input,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_indices,
+ const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes) {
+ output.device(d) = input.slice(slice_indices, slice_sizes);
+}
+
+#define DEFINE_GPU_KERNELS(T) template struct Split<Eigen::GpuDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/string_to_hash_bucket_op.cc b/tensorflow/core/kernels/string_to_hash_bucket_op.cc
new file mode 100644
index 0000000000..bd6fa47268
--- /dev/null
+++ b/tensorflow/core/kernels/string_to_hash_bucket_op.cc
@@ -0,0 +1,47 @@
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+class StringToHashBucketOp : public OpKernel {
+ public:
+ explicit StringToHashBucketOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("num_buckets", &num_buckets_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor* input_tensor;
+ OP_REQUIRES_OK(context, context->input("string_tensor", &input_tensor));
+ const auto& input_flat = input_tensor->flat<string>();
+
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("output", input_tensor->shape(),
+ &output_tensor));
+ auto output_flat = output_tensor->flat<int64>();
+
+ for (int i = 0; i < input_flat.size(); ++i) {
+ const uint64 input_hash = Hash64(input_flat(i));
+ const uint64 bucket_id = input_hash % num_buckets_;
+      // The number of buckets is always in the positive range of int64, so
+      // the resulting bucket_id is as well. Casting the bucket_id from uint64
+      // to int64 is therefore safe.
+ output_flat(i) = static_cast<int64>(bucket_id);
+ }
+ }
+
+ private:
+ int64 num_buckets_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(StringToHashBucketOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringToHashBucket").Device(DEVICE_CPU),
+ StringToHashBucketOp);
+
+} // namespace tensorflow
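
A minimal standalone sketch of the same modulo-bucketing scheme, using std::hash instead of TF's Hash64 (so the bucket assignments will differ from this kernel's; the function name is illustrative):

#include <cstdint>
#include <functional>
#include <string>

// Map a string to a bucket in [0, num_buckets).
int64_t StringToBucket(const std::string& s, int64_t num_buckets) {
  const uint64_t h = std::hash<std::string>{}(s);
  return static_cast<int64_t>(h % static_cast<uint64_t>(num_buckets));
}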
diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
new file mode 100644
index 0000000000..8d23a4fdf8
--- /dev/null
+++ b/tensorflow/core/kernels/string_to_number_op.cc
@@ -0,0 +1,71 @@
+// See docs in ../ops/parse_ops.cc.
+
+#include <errno.h>
+#include <string>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+static constexpr char kErrorMessage[] =
+ "StringToNumberOp could not correctly convert string: ";
+
+template <typename OutputType>
+class StringToNumberOp : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+
+ void Compute(OpKernelContext* context) override {
+    // This is not a deep copy of the input tensor; input_tensor shares the
+    // same underlying storage as the op's input.
+ const Tensor* input_tensor;
+ OP_REQUIRES_OK(context, context->input("string_tensor", &input_tensor));
+ const auto& input_flat = input_tensor->flat<string>();
+
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output("output", input_tensor->shape(),
+ &output_tensor));
+ auto output_flat = output_tensor->flat<OutputType>();
+
+ for (int i = 0; i < input_flat.size(); ++i) {
+ const char* s = input_flat(i).data();
+ Convert(s, &output_flat(i), context);
+ }
+ }
+
+ private:
+ void Convert(const char* s, OutputType* output_data,
+ OpKernelContext* context);
+};
+
+template <>
+void StringToNumberOp<float>::Convert(const char* s, float* output_data,
+ OpKernelContext* context) {
+ OP_REQUIRES(context, strings::safe_strtof(s, output_data),
+ errors::InvalidArgument(kErrorMessage, s));
+}
+
+template <>
+void StringToNumberOp<int32>::Convert(const char* s, int32* output_data,
+ OpKernelContext* context) {
+ OP_REQUIRES(context, strings::safe_strto32(s, output_data),
+ errors::InvalidArgument(kErrorMessage, s));
+}
+
+// Registers the currently supported output types.
+#define REGISTER(type) \
+ REGISTER_KERNEL_BUILDER(Name("StringToNumber") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("out_type"), \
+ StringToNumberOp<type>)
+REGISTER(float);
+REGISTER(int32);
+#undef REGISTER
+
+} // namespace tensorflow
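
The per-element conversion above leans on strings::safe_strtof / safe_strto32. A standard-library sketch with comparable whole-string validation (illustrative only; not a drop-in for the TF helpers):

#include <cerrno>
#include <cstdlib>
#include <optional>
#include <string>

// Returns std::nullopt unless the whole string parses as a float.
std::optional<float> ParseFloat(const std::string& s) {
  errno = 0;
  char* end = nullptr;
  const float v = std::strtof(s.c_str(), &end);
  if (errno != 0 || end == s.c_str() || *end != '\0') return std::nullopt;
  return v;
}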
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
new file mode 100644
index 0000000000..ba765f2e84
--- /dev/null
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -0,0 +1,169 @@
+// Operators that deal with SummaryProtos (encoded as DT_STRING tensors) as
+// inputs or outputs in various ways.
+
+// See docs in ../ops/summary_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/png/png_io.h"
+
+namespace tensorflow {
+
+class SummaryImageOp : public OpKernel {
+ public:
+ explicit SummaryImageOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("max_images", &max_images_));
+ const TensorProto* proto;
+ OP_REQUIRES_OK(context, context->GetAttr("bad_color", &proto));
+ OP_REQUIRES_OK(context, context->device()->MakeTensorFromProto(
+ *proto, AllocatorAttributes(), &bad_color_));
+ OP_REQUIRES(context, bad_color_.dtype() == DT_UINT8,
+ errors::InvalidArgument("bad_color must be uint8, got ",
+ DataTypeString(bad_color_.dtype())));
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(bad_color_.shape()),
+ errors::InvalidArgument("bad_color must be a vector, got shape ",
+ bad_color_.shape().ShortDebugString()));
+ }
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& tags = c->input(0);
+ const Tensor& tensor = c->input(1);
+ OP_REQUIRES(c, TensorShapeUtils::IsLegacyScalar(tags.shape()),
+ errors::InvalidArgument("Tags must have be a scalar"));
+ OP_REQUIRES(c, tensor.dims() == 4 &&
+ (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
+ tensor.dim_size(3) == 4),
+ errors::InvalidArgument(
+ "Tensor must be 4-D with last dim 1, 3, or 4, not ",
+ tensor.shape().DebugString()));
+ const string& base_tag = tags.scalar<string>()();
+
+ const int batch_size = tensor.dim_size(0);
+ const int h = tensor.dim_size(1);
+ const int w = tensor.dim_size(2);
+ const int hw = h * w; // Compact these two dims for simplicity
+ const int depth = tensor.dim_size(3);
+ auto tensor_eigen = tensor.shaped<float, 3>({batch_size, hw, depth});
+
+ OP_REQUIRES(c, bad_color_.dim_size(0) >= depth,
+ errors::InvalidArgument(
+ "expected depth <= bad_color.size, got depth = ", depth,
+ ", bad_color.size = ", bad_color_.dim_size(0)));
+ auto bad_color_full = bad_color_.vec<uint8>();
+ typename TTypes<uint8>::Vec bad_color(bad_color_full.data(), depth);
+
+ // RGB (or gray or RGBA) is last dimension
+ Eigen::Tensor<uint8, 2, Eigen::RowMajor> image(hw, depth);
+
+ Summary s;
+ const int N = std::min<int>(max_images_, batch_size);
+ for (int i = 0; i < N; ++i) {
+ Summary::Value* v = s.add_value();
+      // The tag depends on the number of requested images (not the number
+      // produced).
+ //
+ // Note that later on avisu uses "/" to figure out a consistent naming
+ // convention for display, so we append "/image" to guarantee that the
+ // image(s) won't be displayed in the global scope with no name.
+ if (max_images_ > 1) {
+ v->set_tag(strings::StrCat(base_tag, "/image/", i));
+ } else {
+ v->set_tag(strings::StrCat(base_tag, "/image"));
+ }
+
+ if (image.size()) {
+ typename TTypes<float>::ConstMatrix values(
+ &tensor_eigen(i, 0, 0),
+ Eigen::DSizes<Eigen::DenseIndex, 2>(hw, depth));
+
+ // Rescale the image to uint8 range.
+ //
+        // We are trying to generate an RGB image from a float tensor. We do
+ // not have any info about the expected range of values in the tensor
+ // but the generated image needs to have all RGB values within [0, 255].
+ //
+ // We use two different algorithms to generate these values. If the
+ // tensor has only positive values we scale them all by 255/max(values).
+ // If the tensor has both negative and positive values we scale them by
+ // the max of their absolute values and center them around 127.
+ //
+        // This works for most cases, but has the inconvenience of not respecting
+ // the relative dynamic range across different instances of the tensor.
+
+ // Compute min and max ignoring nonfinite pixels
+ float image_min = std::numeric_limits<float>::infinity();
+ float image_max = -image_min;
+ for (int i = 0; i < hw; i++) {
+ bool finite = true;
+ for (int j = 0; j < depth; j++) {
+ if (!std::isfinite(values(i, j))) {
+ finite = false;
+ break;
+ }
+ }
+ if (finite) {
+ for (int j = 0; j < depth; j++) {
+ float value = values(i, j);
+ image_min = std::min(image_min, value);
+ image_max = std::max(image_max, value);
+ }
+ }
+ }
+
+ // Pick an affine transform into uint8
+ const float kZeroThreshold = 1e-6;
+ float scale, offset;
+ if (image_min < 0) {
+ float max_val = std::max(std::abs(image_min), std::abs(image_max));
+ scale = max_val < kZeroThreshold ? 0.0f : 127.0f / max_val;
+ offset = 128.0f;
+ } else {
+ scale = image_max < kZeroThreshold ? 0.0f : 255.0f / image_max;
+ offset = 0.0f;
+ }
+
+ // Transform image, turning nonfinite values to bad_color
+ for (int i = 0; i < hw; i++) {
+ bool finite = true;
+ for (int j = 0; j < depth; j++) {
+ if (!std::isfinite(values(i, j))) {
+ finite = false;
+ break;
+ }
+ }
+ if (finite) {
+ image.chip<0>(i) =
+ (values.chip<0>(i) * scale + offset).cast<uint8>();
+ } else {
+ image.chip<0>(i) = bad_color;
+ }
+ }
+ }
+
+ Summary::Image* si = v->mutable_image();
+ si->set_height(h);
+ si->set_width(w);
+ si->set_colorspace(depth);
+ OP_REQUIRES(c, png::WriteImageToBuffer(
+ image.data(), w, h, w * depth, depth, 8, -1,
+ si->mutable_encoded_image_string(), nullptr),
+ errors::Internal("PNG encoding failed"));
+ }
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+
+ private:
+ int64 max_images_;
+ Tensor bad_color_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ImageSummary").Device(DEVICE_CPU),
+ SummaryImageOp);
+
+} // namespace tensorflow
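
The rescaling logic above boils down to choosing one affine map value * scale + offset into [0, 255]. Pulled out on its own (illustrative helper name), with a worked example in the comments:

#include <algorithm>
#include <cmath>

// Choose scale/offset so that pixel = value * scale + offset lands in
// [0, 255], following the two cases described in the comment above.
void PickScaleOffset(float image_min, float image_max, float* scale,
                     float* offset) {
  const float kZeroThreshold = 1e-6f;
  if (image_min < 0) {
    const float max_val = std::max(std::abs(image_min), std::abs(image_max));
    *scale = max_val < kZeroThreshold ? 0.0f : 127.0f / max_val;
    *offset = 128.0f;
  } else {
    *scale = image_max < kZeroThreshold ? 0.0f : 255.0f / image_max;
    *offset = 0.0f;
  }
}
// Example: values in [-2, 1] give scale = 63.5, offset = 128, so
// -2 -> 1, 0 -> 128, and 1 -> 191 after the truncating uint8 cast.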
diff --git a/tensorflow/core/kernels/summary_image_op_test.cc b/tensorflow/core/kernels/summary_image_op_test.cc
new file mode 100644
index 0000000000..ddfeeffc0b
--- /dev/null
+++ b/tensorflow/core/kernels/summary_image_op_test.cc
@@ -0,0 +1,141 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+namespace {
+
+static void EXPECT_SummaryMatches(const Summary& actual,
+ const string& expected_str) {
+ Summary expected;
+ CHECK(protobuf::TextFormat::ParseFromString(expected_str, &expected));
+ EXPECT_EQ(expected.DebugString(), actual.DebugString());
+}
+
+// --------------------------------------------------------------------------
+// SummaryImageOp
+// --------------------------------------------------------------------------
+class SummaryImageOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(int max_images) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "ImageSummary")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Attr("max_images", max_images)
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+
+ void CheckAndRemoveEncodedImages(Summary* summary) {
+ for (int i = 0; i < summary->value_size(); ++i) {
+ Summary::Value* value = summary->mutable_value(i);
+ ASSERT_TRUE(value->has_image()) << "No image for value: " << value->tag();
+ ASSERT_FALSE(value->image().encoded_image_string().empty())
+ << "No encoded_image_string for value: " << value->tag();
+ if (VLOG_IS_ON(2)) {
+ // When LOGGING, output the images to disk for manual inspection.
+ TF_CHECK_OK(WriteStringToFile(
+ Env::Default(), strings::StrCat("/tmp/", value->tag(), ".png"),
+ value->image().encoded_image_string()));
+ }
+ value->mutable_image()->clear_encoded_image_string();
+ }
+ }
+};
+
+TEST_F(SummaryImageOpTest, ThreeGrayImagesOutOfFive4dInput) {
+ MakeOp(3 /* max images */);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"tag"});
+ AddInputFromArray<float>(TensorShape({5, 2, 1, 1}),
+ {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ CheckAndRemoveEncodedImages(&summary);
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag/image/0' image { width: 1 height: 2 colorspace: 1} }
+ value { tag: 'tag/image/1' image { width: 1 height: 2 colorspace: 1} }
+ value { tag: 'tag/image/2' image { width: 1 height: 2 colorspace: 1} }
+ )");
+}
+
+TEST_F(SummaryImageOpTest, OneGrayImage4dInput) {
+ MakeOp(1 /* max images */);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"tag"});
+ AddInputFromArray<float>(TensorShape({5 /*batch*/, 2, 1, 1 /*depth*/}),
+ {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ CheckAndRemoveEncodedImages(&summary);
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag/image' image { width: 1 height: 2 colorspace: 1} })");
+}
+
+TEST_F(SummaryImageOpTest, OneColorImage4dInput) {
+ MakeOp(1 /* max images */);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"tag"});
+ AddInputFromArray<float>(
+ TensorShape({1 /*batch*/, 5 /*rows*/, 2 /*columns*/, 3 /*depth*/}),
+ {
+ /* r0, c0, RGB */ 1.0, 0.1, 0.2,
+ /* r0, c1, RGB */ 1.0, 0.3, 0.4,
+ /* r1, c0, RGB */ 0.0, 1.0, 0.0,
+ /* r1, c1, RGB */ 0.0, 1.0, 0.0,
+ /* r2, c0, RGB */ 0.0, 0.0, 1.0,
+ /* r2, c1, RGB */ 0.0, 0.0, 1.0,
+ /* r3, c0, RGB */ 1.0, 1.0, 0.0,
+ /* r3, c1, RGB */ 1.0, 0.0, 1.0,
+ /* r4, c0, RGB */ 1.0, 1.0, 0.0,
+ /* r4, c1, RGB */ 1.0, 0.0, 1.0,
+ });
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ CheckAndRemoveEncodedImages(&summary);
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag/image' image { width: 2 height: 5 colorspace: 3} })");
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
new file mode 100644
index 0000000000..1c4be64b8b
--- /dev/null
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -0,0 +1,141 @@
+// Operators that deal with SummaryProtos (encoded as DT_STRING tensors) as
+// inputs or outputs in various ways.
+
+// See docs in ../ops/summary_ops.cc.
+
+#include <unordered_set>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SummaryScalarOp : public OpKernel {
+ public:
+ explicit SummaryScalarOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& tags = c->input(0);
+ const Tensor& values = c->input(1);
+
+ OP_REQUIRES(c, tags.IsSameSize(values) ||
+ (TensorShapeUtils::IsLegacyScalar(tags.shape()) &&
+ TensorShapeUtils::IsLegacyScalar(values.shape())),
+ errors::InvalidArgument("tags and values not the same shape: ",
+ tags.shape().ShortDebugString(), " != ",
+ values.shape().ShortDebugString()));
+ auto Ttags = tags.flat<string>();
+ auto Tvalues = values.flat<T>();
+ Summary s;
+ for (int i = 0; i < Ttags.size(); i++) {
+ Summary::Value* v = s.add_value();
+ v->set_tag(Ttags(i));
+ v->set_simple_value(Tvalues(i));
+ }
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ScalarSummary")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ SummaryScalarOp<float>);
+REGISTER_KERNEL_BUILDER(Name("ScalarSummary")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ SummaryScalarOp<double>);
+
+class SummaryHistoOp : public OpKernel {
+ public:
+ // SummaryHistoOp could be extended to take a list of custom bucket
+ // boundaries as an option.
+ explicit SummaryHistoOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ const Tensor& tags = c->input(0);
+ const Tensor& values = c->input(1);
+ const auto flat = values.flat<float>();
+ OP_REQUIRES(c, TensorShapeUtils::IsLegacyScalar(tags.shape()),
+ errors::InvalidArgument("tags must be scalar"));
+ // Build histogram of values in "values" tensor
+ histogram::Histogram histo;
+ for (int64 i = 0; i < flat.size(); i++) {
+ float v = flat(i);
+ if (!std::isfinite(v)) {
+ c->SetStatus(
+ errors::OutOfRange("Nan in summary histogram for: ", name()));
+ break;
+ }
+ histo.Add(v);
+ }
+
+ Summary s;
+ Summary::Value* v = s.add_value();
+ v->set_tag(tags.scalar<string>()());
+ histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */);
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("HistogramSummary").Device(DEVICE_CPU),
+ SummaryHistoOp);
+
+struct HistogramResource : public ResourceBase {
+ histogram::ThreadSafeHistogram histogram;
+
+  string DebugString() override { return "A histogram summary. Stats ..."; }
+};
+
+class SummaryMergeOp : public OpKernel {
+ public:
+ explicit SummaryMergeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* c) override {
+ Summary s;
+ std::unordered_set<string> tags;
+ for (int input_num = 0; input_num < c->num_inputs(); input_num++) {
+ const Tensor& in = c->input(input_num);
+ auto in_vec = in.flat<string>();
+ for (int i = 0; i < in_vec.dimension(0); i++) {
+ const string& s_in = in_vec(i);
+ Summary summary_in;
+ if (!ParseProtoUnlimited(&summary_in, s_in)) {
+ c->SetStatus(errors::InvalidArgument(
+ "Could not parse one of the summary inputs"));
+ return;
+ }
+
+ for (int v = 0; v < summary_in.value_size(); v++) {
+ if (!tags.insert(summary_in.value(v).tag()).second) {
+ c->SetStatus(errors::InvalidArgument(
+ strings::StrCat("Duplicate tag ", summary_in.value(v).tag(),
+ " found in summary inputs")));
+ return;
+ }
+ *s.add_value() = summary_in.value(v);
+ }
+ }
+ }
+
+ Tensor* summary_tensor = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+ CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MergeSummary").Device(DEVICE_CPU),
+ SummaryMergeOp);
+
+} // namespace tensorflow
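
The merge op's main invariant is that tags are unique across every input summary. The duplicate check on its own, outside the OpKernel machinery (illustrative names):

#include <string>
#include <unordered_set>
#include <vector>

// Returns false (and reports the offender) if any tag repeats.
bool TagsAreUnique(const std::vector<std::string>& tags,
                   std::string* duplicate) {
  std::unordered_set<std::string> seen;
  for (const std::string& tag : tags) {
    if (!seen.insert(tag).second) {
      *duplicate = tag;
      return false;
    }
  }
  return true;
}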
diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc
new file mode 100644
index 0000000000..fd271a6862
--- /dev/null
+++ b/tensorflow/core/kernels/summary_op_test.cc
@@ -0,0 +1,282 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+static void EXPECT_SummaryMatches(const Summary& actual,
+ const string& expected_str) {
+ Summary expected;
+ CHECK(protobuf::TextFormat::ParseFromString(expected_str, &expected));
+ EXPECT_EQ(expected.DebugString(), actual.DebugString());
+}
+
+class SummaryScalarOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType dt) {
+ RequireDefaultOps();
+ ASSERT_OK(NodeDefBuilder("myop", "ScalarSummary")
+ .Input(FakeInput())
+ .Input(FakeInput(dt))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SummaryScalarOpTest, SimpleFloat) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({3}), {"tag1", "tag2", "tag3"});
+ AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag1' simple_value: 1.0 }
+ value { tag: 'tag2' simple_value: -0.73 }
+ value { tag: 'tag3' simple_value: 10000.0 }
+ )");
+}
+
+TEST_F(SummaryScalarOpTest, SimpleDouble) {
+ MakeOp(DT_DOUBLE);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({3}), {"tag1", "tag2", "tag3"});
+ AddInputFromArray<double>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+ EXPECT_SummaryMatches(summary, R"(
+ value { tag: 'tag1' simple_value: 1.0 }
+ value { tag: 'tag2' simple_value: -0.73 }
+ value { tag: 'tag3' simple_value: 10000.0 }
+ )");
+}
+
+TEST_F(SummaryScalarOpTest, Error_MismatchedSize) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s;
+}
+
+TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString()).contains("tags and values not the same shape"))
+ << s;
+}
+
+TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) {
+ MakeOp(DT_FLOAT);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(
+ StringPiece(s.ToString()).contains("tags and values not the same shape"))
+ << s;
+}
+
+// --------------------------------------------------------------------------
+// SummaryHistoOp
+// --------------------------------------------------------------------------
+class SummaryHistoOpTest : public OpsTestBase {
+ protected:
+ void MakeOp() {
+ ASSERT_OK(NodeDefBuilder("myop", "HistogramSummary")
+ .Input(FakeInput())
+ .Input(FakeInput())
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SummaryHistoOpTest, Simple) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({}), {"taghisto"});
+ AddInputFromArray<float>(TensorShape({3, 2}), {0.1, -0.7, 4.1, 4., 5., 4.});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+ ASSERT_EQ(summary.value_size(), 1);
+ EXPECT_EQ(summary.value(0).tag(), "taghisto");
+ histogram::Histogram histo;
+ EXPECT_TRUE(histo.DecodeFromProto(summary.value(0).histo()));
+ EXPECT_EQ(
+ "Count: 6 Average: 2.7500 StdDev: 2.20\n"
+ "Min: -0.7000 Median: 3.9593 Max: 5.0000\n"
+ "------------------------------------------------------\n"
+ "[ -0.76, -0.69 ) 1 16.667% 16.667% ###\n"
+ "[ 0.093, 0.1 ) 1 16.667% 33.333% ###\n"
+ "[ 3.8, 4.2 ) 3 50.000% 83.333% ##########\n"
+ "[ 4.6, 5.1 ) 1 16.667% 100.000% ###\n",
+ histo.ToString());
+}
+
+TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+}
+
+TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
+ MakeOp();
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
+ AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+}
+
+// --------------------------------------------------------------------------
+// SummaryMergeOp
+// --------------------------------------------------------------------------
+class SummaryMergeOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(int num_inputs) {
+ ASSERT_OK(NodeDefBuilder("myop", "MergeSummary")
+ .Input(FakeInput(num_inputs))
+ .Finalize(node_def()));
+ ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(SummaryMergeOpTest, Simple) {
+ MakeOp(1);
+
+ // Feed and run
+ Summary s1;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } ",
+ &s1));
+ Summary s2;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag3\" simple_value: 10000.0 }", &s2));
+ Summary s3;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag4\" simple_value: 11.0 }", &s3));
+
+ AddInputFromArray<string>(
+ TensorShape({3}),
+ {s1.SerializeAsString(), s2.SerializeAsString(), s3.SerializeAsString()});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ EXPECT_SummaryMatches(summary,
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } "
+ "value { tag: \"tag3\" simple_value: 10000.0 }"
+ "value { tag: \"tag4\" simple_value: 11.0 }");
+}
+
+TEST_F(SummaryMergeOpTest, Simple_MultipleInputs) {
+ MakeOp(3);
+
+ // Feed and run
+ Summary s1;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } ",
+ &s1));
+ Summary s2;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag3\" simple_value: 10000.0 }", &s2));
+ Summary s3;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag4\" simple_value: 11.0 }", &s3));
+
+ AddInputFromArray<string>(TensorShape({}), {s1.SerializeAsString()});
+ AddInputFromArray<string>(TensorShape({}), {s2.SerializeAsString()});
+ AddInputFromArray<string>(TensorShape({}), {s3.SerializeAsString()});
+ ASSERT_OK(RunOpKernel());
+
+ // Check the output size.
+ Tensor* out_tensor = GetOutput(0);
+ ASSERT_EQ(0, out_tensor->dims());
+ Summary summary;
+ ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
+
+ EXPECT_SummaryMatches(summary,
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tag2\" simple_value: -0.73 } "
+ "value { tag: \"tag3\" simple_value: 10000.0 }"
+ "value { tag: \"tag4\" simple_value: 11.0 }");
+}
+
+TEST_F(SummaryMergeOpTest, Error_DuplicateTag) {
+ MakeOp(1);
+
+ // Feed and run
+ Summary s1;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tag1\" simple_value: 1.0 } "
+ "value { tag: \"tagduplicate\" simple_value: -0.73 } ",
+ &s1));
+ Summary s2;
+ ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+ "value { tag: \"tagduplicate\" simple_value: 1.0 } ", &s2));
+ AddInputFromArray<string>(TensorShape({2}),
+ {s1.SerializeAsString(), s2.SerializeAsString()});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("Duplicate tag")) << s;
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/text_line_reader_op.cc b/tensorflow/core/kernels/text_line_reader_op.cc
new file mode 100644
index 0000000000..51e4d6a2b8
--- /dev/null
+++ b/tensorflow/core/kernels/text_line_reader_op.cc
@@ -0,0 +1,99 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+
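+// Outputs one record per line of a text file. Keys are
+// "<filename>:<line number>", where the line number is 1-based and counts any
+// skipped header lines.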
+class TextLineReader : public ReaderBase {
+ public:
+ TextLineReader(const string& node_name, int skip_header_lines, Env* env)
+ : ReaderBase(strings::StrCat("TextLineReader '", node_name, "'")),
+ skip_header_lines_(skip_header_lines),
+ env_(env),
+ line_number_(0) {}
+
+ Status OnWorkStartedLocked() override {
+ line_number_ = 0;
+ RandomAccessFile* file = nullptr;
+ TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file));
+ input_buffer_.reset(new io::InputBuffer(file, kBufferSize));
+ for (; line_number_ < skip_header_lines_; ++line_number_) {
+ string line_contents;
+ Status status = input_buffer_->ReadLine(&line_contents);
+ if (errors::IsOutOfRange(status)) {
+ // We ignore an end of file error when skipping header lines.
+ // We will end up skipping this file.
+ return Status::OK();
+ }
+ TF_RETURN_IF_ERROR(status);
+ }
+ return Status::OK();
+ }
+
+ Status OnWorkFinishedLocked() override {
+ input_buffer_.reset(nullptr);
+ return Status::OK();
+ }
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ Status status = input_buffer_->ReadLine(value);
+ ++line_number_;
+ if (status.ok()) {
+ *key = strings::StrCat(current_work(), ":", line_number_);
+ *produced = true;
+ return status;
+ }
+ if (errors::IsOutOfRange(status)) { // End of file, advance to the next.
+ *at_end = true;
+ return Status::OK();
+ } else { // Some other reading error
+ return status;
+ }
+ }
+
+ Status ResetLocked() override {
+ line_number_ = 0;
+ input_buffer_.reset(nullptr);
+ return ReaderBase::ResetLocked();
+ }
+
+ // TODO(josh11b): Implement serializing and restoring the state. Need
+ // to create TextLineReaderState proto to store ReaderBaseState,
+ // line_number_, and input_buffer_->Tell().
+
+ private:
+ enum { kBufferSize = 256 << 10 /* 256 kB */ };
+ const int skip_header_lines_;
+ Env* const env_;
+ int64 line_number_;
+ std::unique_ptr<io::InputBuffer> input_buffer_;
+};
+
+class TextLineReaderOp : public ReaderOpKernel {
+ public:
+ explicit TextLineReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ int skip_header_lines = -1;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("skip_header_lines", &skip_header_lines));
+ OP_REQUIRES(context, skip_header_lines >= 0,
+ errors::InvalidArgument("skip_header_lines must be >= 0 not ",
+ skip_header_lines));
+ Env* env = context->env();
+ SetReaderFactory([this, skip_header_lines, env]() {
+ return new TextLineReader(name(), skip_header_lines, env);
+ });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU),
+ TextLineReaderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/tf_record_reader_op.cc b/tensorflow/core/kernels/tf_record_reader_op.cc
new file mode 100644
index 0000000000..551be18d5f
--- /dev/null
+++ b/tensorflow/core/kernels/tf_record_reader_op.cc
@@ -0,0 +1,76 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+
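+// Outputs one record per TFRecord in a file. Keys are
+// "<filename>:<byte offset at which the record starts>".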
+class TFRecordReader : public ReaderBase {
+ public:
+ TFRecordReader(const string& node_name, Env* env)
+ : ReaderBase(strings::StrCat("TFRecordReader '", node_name, "'")),
+ env_(env),
+ offset_(0) {}
+
+ Status OnWorkStartedLocked() override {
+ offset_ = 0;
+ RandomAccessFile* file = nullptr;
+ TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file));
+ file_.reset(file);
+ reader_.reset(new io::RecordReader(file));
+ return Status::OK();
+ }
+
+ Status OnWorkFinishedLocked() override {
+ reader_.reset(nullptr);
+ file_.reset(nullptr);
+ return Status::OK();
+ }
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ *key = strings::StrCat(current_work(), ":", offset_);
+ Status status = reader_->ReadRecord(&offset_, value);
+ if (errors::IsOutOfRange(status)) {
+ *at_end = true;
+ return Status::OK();
+ }
+ if (!status.ok()) return status;
+ *produced = true;
+ return Status::OK();
+ }
+
+ Status ResetLocked() override {
+ offset_ = 0;
+ reader_.reset(nullptr);
+ file_.reset(nullptr);
+ return ReaderBase::ResetLocked();
+ }
+
+ // TODO(josh11b): Implement serializing and restoring the state.
+
+ private:
+ Env* const env_;
+ uint64 offset_;
+ std::unique_ptr<RandomAccessFile> file_;
+ std::unique_ptr<io::RecordReader> reader_;
+};
+
+class TFRecordReaderOp : public ReaderOpKernel {
+ public:
+ explicit TFRecordReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ Env* env = context->env();
+ SetReaderFactory([this, env]() { return new TFRecordReader(name(), env); });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TFRecordReader").Device(DEVICE_CPU),
+ TFRecordReaderOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
new file mode 100644
index 0000000000..d5e0e89d60
--- /dev/null
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -0,0 +1,460 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#ifdef GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/tile_ops.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// --------------------------------------------------------------------------
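+// TileOp replicates `input` along each of its dimensions: output dimension i
+// has size input.dim_size(i) * multiples[i]. For example, tiling a [2, 3]
+// tensor with multiples [2, 2] produces a [4, 6] tensor.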
+template <typename Device>
+class TileOp : public OpKernel {
+ public:
+ explicit TileOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& multiples = context->input(1);
+
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyVector(multiples.shape()),
+ errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
+ multiples.shape().ShortDebugString()));
+ OP_REQUIRES(context, input.dims() == multiples.NumElements(),
+ errors::InvalidArgument(
+ "Expected multiples argument to be a vector of length ",
+ input.dims(), " but got length ", multiples.dim_size(0)));
+
+ const int input_dims = input.dims();
+ const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
+ input_dims);
+
+ TensorShape output_shape;
+ for (int i = 0; i < input_dims; ++i) {
+ OP_REQUIRES(
+ context, multiples_array[i] > 0,
+ errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
+ multiples_array[i]));
+ output_shape.AddDim(input.dim_size(i) * multiples_array[i]);
+ }
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+
+#define HANDLE_DIM(DT, NDIM) \
+ if (context->input(0).dtype() == DT && input_dims == NDIM) { \
+ HandleCase<DT, NDIM>(context, multiples_array, result); \
+ return; \
+ }
+
+#define HANDLE_TYPE(T) \
+ HANDLE_DIM(T, 0) \
+ HANDLE_DIM(T, 1) \
+ HANDLE_DIM(T, 2) \
+ HANDLE_DIM(T, 3) \
+ HANDLE_DIM(T, 4) \
+ HANDLE_DIM(T, 5)
+
+ HANDLE_TYPE(DT_BOOL);
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_UINT8);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT64);
+ HANDLE_TYPE(DT_STRING); // when DEVICE=CPUDevice.
+
+#undef HANDLE_TYPE
+#undef HANDLE_DIM
+
+ OP_REQUIRES(context, false,
+ errors::Unimplemented(
+ "TileOp : Unhandled input dimensions, DT : ",
+ context->input(0).dtype(), ", dims : ", input_dims));
+ }
+
+ private:
+ template <DataType DT, int NDIM>
+ void HandleCaseImpl(OpKernelContext* context,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result) {
+ typedef typename EnumToDataType<DT>::Type T;
+ Eigen::array<int32, NDIM> broadcast_array;
+ for (int i = 0; i < NDIM; ++i) {
+ broadcast_array[i] = multiples_array[i];
+ }
+ functor::Tile<Device, T, NDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), broadcast_array);
+ }
+
+ template <DataType DT, int NDIM>
+ void HandleCase(OpKernelContext* context,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
+};
+
+template <typename Device>
+template <DataType DT, int NDIM>
+inline void TileOp<Device>::HandleCase(
+ OpKernelContext* context, const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result) {
+ LOG(FATAL) << "TileOp: Invalid combination of Device, DT and NDIM: "
+ << typeid(Device).name() << ", " << DataTypeString(DT) << ", "
+ << NDIM;
+}
+
+#define HANDLE_CASE(device, dtype, ndim) \
+ template <> \
+ template <> \
+ void TileOp<device>::HandleCase<dtype, ndim>( \
+ OpKernelContext * context, \
+ const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \
+ HandleCaseImpl<dtype, ndim>(context, multiples_array, result); \
+ }
+
+#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
+ HANDLE_CASE(device, dtype, 1); \
+ HANDLE_CASE(device, dtype, 2); \
+ HANDLE_CASE(device, dtype, 3); \
+ HANDLE_CASE(device, dtype, 4); \
+ HANDLE_CASE(device, dtype, 5);
+
+#define HANDLE_CASE_DIM(device, dtype) \
+ HANDLE_CASE(device, dtype, 0); \
+ HANDLE_CASE_DIM_POSITIVE(device, dtype);
+
+HANDLE_CASE_DIM(CPUDevice, DT_BOOL);
+HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(CPUDevice, DT_UINT8);
+HANDLE_CASE_DIM(CPUDevice, DT_INT32);
+HANDLE_CASE_DIM(CPUDevice, DT_INT16);
+HANDLE_CASE_DIM(CPUDevice, DT_INT64);
+HANDLE_CASE_DIM(CPUDevice, DT_STRING);
+
+#if GOOGLE_CUDA
+// Eigen on GPU does not handle 0-dimension data types yet.
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+#endif // GOOGLE_CUDA
+
+#undef HANDLE_CASE_DIM_POSITIVE
+#undef HANDLE_CASE_DIM
+#undef HANDLE_CASE
+
+// --------------------------------------------------------------------------
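+// TileGradientOp computes the gradient of Tile by summing the incoming
+// gradient over all tiles, so output dimension i has size
+// input.dim_size(i) / multiples[i].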
+template <typename Device>
+class TileGradientOp : public OpKernel {
+ public:
+ explicit TileGradientOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ const Tensor& multiples = context->input(1);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsLegacyVector(multiples.shape()),
+ errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
+ multiples.shape().ShortDebugString()));
+ OP_REQUIRES(context, input.dims() == multiples.NumElements(),
+ errors::InvalidArgument(
+ "Expected multiples argument to be a vector of length ",
+ input.dims(), " but got length ", multiples.dim_size(0)));
+
+ const int input_dims = input.dims();
+ const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
+ input_dims);
+
+ TensorShape output_shape;
+ std::vector<int32> input_dim_size_vec;
+ for (int i = 0; i < input_dims; ++i) {
+ OP_REQUIRES(
+ context, multiples_array[i] > 0,
+ errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
+ multiples_array[i]));
+ OP_REQUIRES(context, input.dim_size(i) % multiples_array[i] == 0,
+ errors::InvalidArgument("Expected input_dim[", i,
+ "] to be divisible by multiples[", i,
+ "], but ", input.dim_size(i), " % ",
+ multiples_array[i], " != 0"));
+ output_shape.AddDim(input.dim_size(i) / multiples_array[i]);
+ input_dim_size_vec.push_back(input.dim_size(i));
+ }
+ Tensor* result = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+
+#define HANDLE_DIM(DT, NDIM) \
+ if (context->input(0).dtype() == DT && input_dims == NDIM) { \
+ HandleCase<DT, NDIM>(context, input_dim_size_vec, multiples_array, \
+ result); \
+ return; \
+ }
+
+#define HANDLE_TYPE(T) \
+ HANDLE_DIM(T, 0) \
+ HANDLE_DIM(T, 1) \
+ HANDLE_DIM(T, 2) \
+ HANDLE_DIM(T, 3) \
+ HANDLE_DIM(T, 4) \
+ HANDLE_DIM(T, 5)
+
+ HANDLE_TYPE(DT_FLOAT);
+ HANDLE_TYPE(DT_DOUBLE);
+ HANDLE_TYPE(DT_INT32);
+ HANDLE_TYPE(DT_INT16);
+ HANDLE_TYPE(DT_INT64);
+
+#undef HANDLE_TYPE
+#undef HANDLE_DIM
+
+ OP_REQUIRES(context, false,
+ errors::Unimplemented(
+ "TileGradientOp : Unhandled input dimensions, DT : ",
+ context->input(0).dtype(), ", dims : ", input_dims));
+ }
+
+ private:
+ template <DataType DT, int NDIM>
+ void HandleCase(OpKernelContext* context,
+ const std::vector<int32>& input_dims,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result);
+
+ template <DataType DT, int NDIM>
+ void HandleCaseImpl(OpKernelContext* context,
+ const std::vector<int32>& input_dims,
+ const gtl::ArraySlice<int32>& multiples_array,
+ Tensor* result) {
+ typedef typename EnumToDataType<DT>::Type T;
+
+ bool reduction_only = true;
+ std::vector<int> reduction_dims;
+
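+    // The gradient collapses to a single sum + reshape when every dimension
+    // was either not tiled (multiples[i] == 1) or was tiled from size 1
+    // (multiples[i] == input_dims[i]); such dimensions are collected in
+    // reduction_dims and handled by HandleReduce below. Otherwise we fall
+    // through to the generic slice-accumulation loop.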
+ for (int i = 0; i < NDIM; ++i) {
+ if (input_dims[i] > multiples_array[i] && multiples_array[i] > 1) {
+ reduction_only = false;
+ break;
+ } else {
+ if (multiples_array[i] == input_dims[i]) {
+ reduction_dims.push_back(i);
+ }
+ }
+ }
+
+ if (reduction_only) {
+#define HANDLE_DIM(D) \
+ if (reduction_dims.size() == (D)) { \
+ HandleReduce<T, NDIM, (D)>(context, reduction_dims, result); \
+ return; \
+ }
+ // NOTE(keveman): Handling the most common case here.
+ // Adding more cases here would require more templating and code
+ // explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1.
+ HANDLE_DIM(NDIM > 0 ? 1 : 0);
+
+// Fall through to the unoptimized version.
+#undef HANDLE_DIM
+ }
+
+ Eigen::DSizes<ptrdiff_t, NDIM> indices;
+ Eigen::DSizes<ptrdiff_t, NDIM> sizes;
+
+ // Accumulate slices along the dimensions into the output. The number of
+ // slices along dimension 'i' is simply the multiple along dimension 'i'
+ // passed to the original Tile op.
+ for (int i = 0; i < NDIM; ++i) {
+ sizes[i] = input_dims[i] / multiples_array[i];
+ indices[i] = 0;
+ }
+
+ bool first = true;
+ while (true) {
+ functor::TileGrad<Device, T, NDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), indices, sizes, first);
+ first = false;
+ // Increment the begin indices.
+ int i = 0;
+ while (i < NDIM && indices[i] / sizes[i] == multiples_array[i] - 1) {
+ indices[i] = 0;
+ ++i;
+ }
+ // We are finished if we have iterated to the maximum along all
+ // dimensions.
+ if (i == NDIM) {
+ break;
+ }
+ indices[i] += sizes[i];
+ }
+ }
+
+ template <typename T, int NDIM, int REDUCENDIM>
+ void HandleReduce(OpKernelContext* context,
+ const std::vector<int32>& reduce_dim_in, Tensor* result) {
+ static_assert(NDIM >= REDUCENDIM, "Too many reduced dimensions");
+ Eigen::DSizes<ptrdiff_t, REDUCENDIM> reduce_dim;
+ Eigen::DSizes<ptrdiff_t, NDIM> reshape_dim;
+
+ for (int i = 0; i < REDUCENDIM; ++i) {
+ reduce_dim[i] = reduce_dim_in[i];
+ }
+
+ for (int i = 0; i < NDIM; ++i) {
+ reshape_dim[i] = result->dim_size(i);
+ }
+
+ functor::ReduceAndReshape<Device, T, NDIM, REDUCENDIM>()(
+ context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+ context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim);
+ }
+
+ TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp);
+};
+
+template <typename Device>
+template <DataType DT, int NDIM>
+inline void TileGradientOp<Device>::HandleCase(
+ OpKernelContext* context, const std::vector<int32>& input_dims,
+ const gtl::ArraySlice<int32>& multiples_array, Tensor* result) {
+ LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: "
+ << typeid(Device).name() << ", " << DataTypeString(DT) << ", "
+ << NDIM;
+}
+
+#define HANDLE_CASE(device, dtype, ndim) \
+ template <> \
+ template <> \
+ void TileGradientOp<device>::HandleCase<dtype, ndim>( \
+ OpKernelContext * context, const std::vector<int32>& input_dims, \
+ const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \
+ HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \
+ }
+
+#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
+ HANDLE_CASE(device, dtype, 1); \
+ HANDLE_CASE(device, dtype, 2); \
+ HANDLE_CASE(device, dtype, 3); \
+ HANDLE_CASE(device, dtype, 4); \
+ HANDLE_CASE(device, dtype, 5);
+
+#define HANDLE_CASE_DIM(device, dtype) \
+ HANDLE_CASE(device, dtype, 0); \
+ HANDLE_CASE_DIM_POSITIVE(device, dtype);
+
+HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(CPUDevice, DT_INT16);
+HANDLE_CASE_DIM(CPUDevice, DT_INT32);
+HANDLE_CASE_DIM(CPUDevice, DT_INT64);
+
+#if GOOGLE_CUDA
+// Eigen on GPU does not handle 0-dimension data types yet.
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+#endif // GOOGLE_CUDA
+
+#undef HANDLE_CASE_DIM_POSITIVE
+#undef HANDLE_CASE_DIM
+#undef HANDLE_CASE
+
+REGISTER_KERNEL_BUILDER(Name("Tile").Device(DEVICE_CPU).HostMemory("multiples"),
+ TileOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_CPU)
+ .HostMemory("multiples"),
+ TileGradientOp<CPUDevice>);
+
+#if GOOGLE_CUDA
+#define DEFINE_GPU_TYPE(T) \
+ DEFINE_GPU_DIM(T, 1) \
+ DEFINE_GPU_DIM(T, 2) \
+ DEFINE_GPU_DIM(T, 3) \
+ DEFINE_GPU_DIM(T, 4) \
+ DEFINE_GPU_DIM(T, 5)
+
+#define DEFINE_GPU_DIM(T, NDIM) \
+ template <> \
+ void Tile<GPUDevice, T, NDIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \
+ typename TTypes<T, NDIM>::ConstTensor in, \
+ const Eigen::array<int32, NDIM>& broadcast_array) const; \
+ extern template struct Tile<GPUDevice, T, NDIM>; \
+ template <> \
+ void TileGrad<GPUDevice, T, NDIM>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \
+ typename TTypes<T, NDIM>::ConstTensor in, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& indices, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& sizes, bool first) const; \
+ extern template struct TileGrad<GPUDevice, T, NDIM>; \
+ template <> \
+ void ReduceAndReshape<GPUDevice, T, NDIM, 1>::operator()( \
+ const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \
+ typename TTypes<T, NDIM>::ConstTensor in, \
+ const Eigen::DSizes<ptrdiff_t, 1>& reduce_dim, \
+ const Eigen::DSizes<ptrdiff_t, NDIM>& reshape_dim) const; \
+ extern template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>;
+
+namespace functor {
+DEFINE_GPU_TYPE(float);
+DEFINE_GPU_TYPE(double);
+DEFINE_GPU_TYPE(int64);
+DEFINE_GPU_TYPE(int32);
+DEFINE_GPU_TYPE(int16);
+} // end namespace functor
+
+#undef DEFINE_GPU_DIM
+#undef DEFINE_GPU_TYPE
+
+REGISTER_KERNEL_BUILDER(Name("Tile")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("multiples"),
+ TileOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("multiples"),
+ TileOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int16>("T")
+ .HostMemory("multiples"),
+ TileOp<GPUDevice>);
+
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T")
+ .HostMemory("multiples"),
+ TileGradientOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("multiples"),
+ TileGradientOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int16>("T")
+ .HostMemory("multiples"),
+ TileGradientOp<GPUDevice>);
+#endif // GOOGLE_CUDA
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_ops.h b/tensorflow/core/kernels/tile_ops.h
new file mode 100644
index 0000000000..b3cc6165e0
--- /dev/null
+++ b/tensorflow/core/kernels/tile_ops.h
@@ -0,0 +1,48 @@
+#ifndef TENSORFLOW_KERNELS_TILE_OPS_H_
+#define TENSORFLOW_KERNELS_TILE_OPS_H_
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, int NDIM>
+struct Tile {
+ void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
+ typename TTypes<T, NDIM>::ConstTensor in,
+ const Eigen::array<int32, NDIM>& broadcast_array) const {
+ out.device(d) = in.broadcast(broadcast_array);
+ }
+};
+
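+// Copies (first == true) or accumulates (first == false) the slice of `in`
+// described by `indices` and `sizes` into `out`.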
+template <typename Device, typename T, int NDIM>
+struct TileGrad {
+ void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
+ typename TTypes<T, NDIM>::ConstTensor in,
+ const Eigen::DSizes<ptrdiff_t, NDIM>& indices,
+ const Eigen::DSizes<ptrdiff_t, NDIM>& sizes,
+ bool first) const {
+ if (first) {
+ out.device(d) = in.slice(indices, sizes);
+ } else {
+ out.device(d) += in.slice(indices, sizes);
+ }
+ }
+};
+
+template <typename Device, typename T, int NDIM, int REDUCEDNDIM>
+struct ReduceAndReshape {
+ void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
+ typename TTypes<T, NDIM>::ConstTensor in,
+ const Eigen::DSizes<ptrdiff_t, REDUCEDNDIM>& reduce_dim,
+ const Eigen::DSizes<ptrdiff_t, NDIM>& reshape_dim) const {
+ out.device(d) = in.sum(reduce_dim).reshape(reshape_dim);
+ }
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_TILE_OPS_H_
diff --git a/tensorflow/core/kernels/tile_ops_gpu.cu.cc b/tensorflow/core/kernels/tile_ops_gpu.cu.cc
new file mode 100644
index 0000000000..29481e1a54
--- /dev/null
+++ b/tensorflow/core/kernels/tile_ops_gpu.cu.cc
@@ -0,0 +1,38 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_ops.h"
+#include <stdio.h>
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define DEFINE_TYPE(T) \
+ DEFINE_DIM(T, 1) \
+ DEFINE_DIM(T, 2) \
+ DEFINE_DIM(T, 3) \
+ DEFINE_DIM(T, 4) \
+ DEFINE_DIM(T, 5)
+
+#define DEFINE_DIM(T, NDIM) \
+ template struct Tile<GPUDevice, T, NDIM>; \
+ template struct TileGrad<GPUDevice, T, NDIM>; \
+ template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>;
+
+DEFINE_TYPE(float)
+DEFINE_TYPE(double)
+DEFINE_TYPE(int64)
+DEFINE_TYPE(int32)
+DEFINE_TYPE(int16)
+// NOTE(keveman): Eigen's int8 and string versions don't compile yet with nvcc.
+
+#undef DEFINE_DIM
+#undef DEFINE_TYPE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
new file mode 100644
index 0000000000..79b5d4d07e
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -0,0 +1,71 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/gtl/top_n.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+template <typename T>
+class TopK : public OpKernel {
+ public:
+ explicit TopK(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("k", &k_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const auto& input_in = context->input(0);
+ OP_REQUIRES(context, input_in.dims() == 2,
+ errors::InvalidArgument("input must be 2-dimensional"));
+ OP_REQUIRES(context, input_in.dim_size(1) >= k_,
+ errors::InvalidArgument("input must have at least k columns"));
+
+ const auto& input = input_in.matrix<T>();
+
+ const auto num_rows = input_in.dim_size(0); // generally batch_size
+ const auto num_cols = input_in.dim_size(1);
+
+ Tensor* values_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({num_rows, k_}), &values_out));
+ Tensor* indices_out = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 1, TensorShape({num_rows, k_}), &indices_out));
+ auto values = values_out->matrix<T>();
+ auto indices = indices_out->matrix<int32>();
+
+ gtl::TopN<std::pair<T, int32>> filter(k_);
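+    // The filter keeps the k largest (value, -column) pairs of the current
+    // row; it is reset after each row.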
+
+ for (int r = 0; r < num_rows; r++) {
+ for (int32 c = 0; c < num_cols; ++c) {
+ // The second element is the negated index, so that lower-index elements
+ // are considered larger than higher-index elements in case of ties.
+ filter.push(std::make_pair(input(r, c), -c));
+ }
+
+ std::unique_ptr<std::vector<std::pair<T, int32>>> top_k(filter.Extract());
+ for (int32 i = 0; i < k_; ++i) {
+ values(r, i) = (*top_k)[i].first;
+ indices(r, i) = -(*top_k)[i].second;
+ }
+ filter.Reset();
+ }
+ }
+
+ private:
+ int k_;
+};
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("TopK").Device(DEVICE_CPU).TypeConstraint<type>("T"), TopK<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
new file mode 100644
index 0000000000..611fa4ac41
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -0,0 +1,884 @@
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/training_ops.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
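+// Updates on small tensors (<= 256K elements) are evaluated inline on the
+// calling thread; larger ones are expressed through the Eigen device so the
+// thread pool can parallelize them.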
+static inline bool DoInline(int64 size) { return size <= (256ll << 10); }
+
+template <typename T>
+struct ApplyGradientDescent<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad) {
+ if (DoInline(var.size())) {
+ var -= grad * lr();
+ } else {
+ var.device(d) -= grad * lr();
+ }
+ }
+};
+
+template <typename T>
+struct ApplyAdagrad<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad) {
+ if (DoInline(var.size())) {
+ accum += grad.square();
+ var -= grad * lr() * accum.rsqrt();
+ } else {
+ accum.device(d) += grad.square();
+ var.device(d) -= grad * lr() * accum.rsqrt();
+ }
+ }
+};
+
+template <typename T>
+struct ApplyMomentum<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad,
+ typename TTypes<T>::ConstScalar momentum) {
+ if (DoInline(var.size())) {
+ accum = accum * momentum() + grad;
+ var -= accum * lr();
+ } else {
+ accum.device(d) = accum * momentum() + grad;
+ var.device(d) -= accum * lr();
+ }
+ }
+};
+
+template <typename T>
+struct ApplyAdam<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+ typename TTypes<T>::ConstScalar beta1_power,
+ typename TTypes<T>::ConstScalar beta2_power,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar beta1,
+ typename TTypes<T>::ConstScalar beta2,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
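+    // Adam update as implemented below:
+    //   alpha = lr * sqrt(1 - beta2_power) / (1 - beta1_power)
+    //   m    <- m + (1 - beta1) * (grad - m)
+    //   v    <- v + (1 - beta2) * (grad^2 - v)
+    //   var  <- var - alpha * m / (sqrt(v) + epsilon)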
+ const T alpha = lr() * std::sqrt(1 - beta2_power()) / (1 - beta1_power());
+ if (DoInline(var.size())) {
+ m += (grad - m) * (1 - beta1());
+ v += (grad.square() - v) * (1 - beta2());
+ var -= (m * alpha) / (v.sqrt() + epsilon());
+ } else {
+ m.device(d) += (grad - m) * (1 - beta1());
+ v.device(d) += (grad.square() - v) * (1 - beta2());
+ var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
+ }
+ }
+};
+
+template <typename T>
+struct ApplyRMSProp<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar rho,
+ typename TTypes<T>::ConstScalar momentum,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
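+    // RMSProp update as implemented below:
+    //   ms  <- ms + (1 - rho) * (grad^2 - ms)
+    //   mom <- momentum * mom + lr * grad / sqrt(ms + epsilon)
+    //   var <- var - mom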
+ if (DoInline(var.size())) {
+ ms += (grad.square() - ms) * (1 - rho());
+ mom = mom * momentum() + (grad * lr()) / ((ms + epsilon()).sqrt());
+ var -= mom;
+ } else {
+ ms.device(d) += (grad.square() - ms) * (1 - rho());
+ mom.device(d) =
+ mom * momentum() + (grad * lr()) / ((ms + epsilon()).sqrt());
+ var.device(d) -= mom;
+ }
+ }
+};
+
+} // namespace functor
+
+template <typename Device, typename T>
+class ApplyGradientDescentOp : public OpKernel {
+ public:
+ explicit ApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ mutex_lock l(*ctx->input_ref_mutex(0));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ const Tensor& alpha = ctx->input(1);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(alpha.shape()),
+ errors::InvalidArgument("alpha is not a scalar: ",
+ alpha.shape().DebugString()));
+ const Tensor& delta = ctx->input(2);
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(delta.shape()),
+ errors::InvalidArgument("var and delta do not have the same shape",
+ var.shape().DebugString(), " ",
+ delta.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ const Tensor& alpha = ctx->input(1);
+ const Tensor& delta = ctx->input(2);
+ functor::ApplyGradientDescent<Device, T>()(
+ device, var.flat<T>(), alpha.scalar<T>(), delta.flat<T>());
+ }
+};
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyGradientDescentOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyGradientDescent<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::ConstScalar alpha, \
+ typename TTypes<T>::ConstFlat delta); \
+ extern template struct ApplyGradientDescent<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyAdagradOp : public OpKernel {
+ public:
+ explicit ApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ // Don't try to acquire a lock on the second ref as they share the same
+ // mutex.
+ //
+ // mutex_lock l2(*ctx->input_ref_mutex(1));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(grad.shape()),
+ errors::InvalidArgument("var and delta do not have the same shape",
+ var.shape().DebugString(), " ",
+ grad.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ const Tensor& lr = ctx->input(2);
+ const Tensor& grad = ctx->input(3);
+ functor::ApplyAdagrad<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
+ lr.scalar<T>(), grad.flat<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyAdagradOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyAdagrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
+ typename TTypes<T>::ConstFlat grad); \
+ extern template struct ApplyAdagrad<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+// Note, this op works on cpu only.
+template <typename T, typename Tindex>
+class SparseApplyAdagradOp : public OpKernel {
+ public:
+ explicit SparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+ mutex* mu_var = ctx->input_ref_mutex(0);
+ // mu_accum is actually the same mutex as mu_var since currently we use a
+ // global mutex.
+ //
+ // mutex* mu_accum = ctx->input_ref_mutex(1);
+ if (use_exclusive_lock_) {
+ mu_var->lock();
+ }
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+ errors::InvalidArgument("var must be at least 1 dimensional"));
+
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ const Tensor& indices = ctx->input(4);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices must be one-dimensional"));
+
+ for (int d = 1; d < var.dims(); d++) {
+ OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+ errors::InvalidArgument(strings::StrCat(
+ "var and grad must match in dimension ", d)));
+ }
+ const Tindex N = indices.dim_size(0);
+ OP_REQUIRES(
+ ctx, grad.dim_size(0) == N,
+ errors::InvalidArgument(
+ "grad must be the same size as indices in the first dimension."));
+
+ if (N > 0) {
+ const Tindex first_dim_size = var.dim_size(0);
+ // Validate all the indices are in range
+ auto indices_vec = indices.vec<Tindex>();
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ OP_REQUIRES(ctx, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in indices is out of range")));
+ }
+
+ auto var_flat = var.flat_outer_dims<T>();
+ auto accum_flat = accum.flat_outer_dims<T>();
+ auto grad_flat = grad.flat_outer_dims<T>();
+ T lr_scalar = lr.scalar<T>()();
+
+ // Note(yonghui): It might be worth multi-threading square() and rsqrt().
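+      // For each selected row: accum[index] += grad[i]^2 and
+      // var[index] -= lr * grad[i] / sqrt(accum[index]).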
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ auto a = accum_flat.template chip<0>(index);
+ auto g = grad_flat.template chip<0>(i);
+ auto v = var_flat.template chip<0>(index);
+ a += g.square();
+ v -= g.constant(lr_scalar) * g * a.rsqrt();
+ }
+ }
+ if (use_exclusive_lock_) {
+ mu_var->unlock();
+ }
+
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(T, Tindices) \
+ REGISTER_KERNEL_BUILDER(Name("SparseApplyAdagrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<Tindices>("Tindices"), \
+ SparseApplyAdagradOp<T, Tindices>);
+
+REGISTER_KERNELS(float, int32);
+REGISTER_KERNELS(float, int64);
+REGISTER_KERNELS(double, int32);
+REGISTER_KERNELS(double, int64);
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyMomentumOp : public OpKernel {
+ public:
+ explicit ApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ // Don't try to acquire a lock on the second ref as they share the same
+ // mutex.
+ //
+ // mutex_lock l2(*ctx->input_ref_mutex(1));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(grad.shape()),
+ errors::InvalidArgument("var and delta do not have the same shape",
+ var.shape().DebugString(), " ",
+ grad.shape().DebugString()));
+
+ const Tensor& momentum = ctx->input(4);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+ errors::InvalidArgument("momentum is not a scalar: ",
+ momentum.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ const Tensor& lr = ctx->input(2);
+ const Tensor& grad = ctx->input(3);
+ const Tensor& momentum = ctx->input(4);
+ functor::ApplyMomentum<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
+ lr.scalar<T>(), grad.flat<T>(),
+ momentum.scalar<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyMomentumOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyMomentum<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
+ typename TTypes<T>::ConstFlat grad, \
+ typename TTypes<T>::ConstScalar momentum); \
+ extern template struct ApplyMomentum<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+// Note, this op works on cpu only.
+template <typename T, typename Tindex>
+class SparseApplyMomentumOp : public OpKernel {
+ public:
+ explicit SparseApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+ mutex* mu_var = ctx->input_ref_mutex(0);
+ // mu_accum is actually the same mutex as mu_var since currently we use a
+ // global mutex.
+ //
+ // mutex* mu_accum = ctx->input_ref_mutex(1);
+ if (use_exclusive_lock_) {
+ mu_var->lock();
+ }
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, accum.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(accum.shape()),
+ errors::InvalidArgument("var and accum do not have the same shape",
+ var.shape().DebugString(), " ",
+ accum.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+ errors::InvalidArgument("var must be at least 1 dimensional"));
+
+ const Tensor& lr = ctx->input(2);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ const Tensor& grad = ctx->input(3);
+ const Tensor& indices = ctx->input(4);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+ errors::InvalidArgument("indices must be one-dimensional"));
+
+ for (int d = 1; d < var.dims(); d++) {
+ OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+ errors::InvalidArgument(strings::StrCat(
+ "var and grad must match in dimension ", d)));
+ }
+ const Tindex N = indices.dim_size(0);
+ OP_REQUIRES(
+ ctx, grad.dim_size(0) == N,
+ errors::InvalidArgument(
+ "grad must be the same size as indices in the first dimension."));
+
+ const Tensor& momentum = ctx->input(5);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+ errors::InvalidArgument("momentum is not a scalar: ",
+ momentum.shape().DebugString()));
+
+ if (N > 0) {
+ const Tindex first_dim_size = var.dim_size(0);
+ // Validate all the indices are in range
+ auto indices_vec = indices.vec<Tindex>();
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ OP_REQUIRES(ctx, index >= 0 && index < first_dim_size,
+ errors::InvalidArgument(
+ strings::StrCat("Index ", index, " at offset ", i,
+ " in indices is out of range")));
+ }
+
+ auto var_flat = var.flat_outer_dims<T>();
+ auto accum_flat = accum.flat_outer_dims<T>();
+ auto grad_flat = grad.flat_outer_dims<T>();
+ T lr_scalar = lr.scalar<T>()();
+ T momentum_scalar = momentum.scalar<T>()();
+
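+      // For each selected row: accum[index] = momentum * accum[index] + grad[i]
+      // and var[index] -= lr * accum[index].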
+ for (Tindex i = 0; i < N; i++) {
+ const Tindex index = indices_vec(i);
+ auto a = accum_flat.template chip<0>(index);
+ auto g = grad_flat.template chip<0>(i);
+ auto v = var_flat.template chip<0>(index);
+ a = a * a.constant(momentum_scalar) + g;
+ v -= a.constant(lr_scalar) * a;
+ }
+ }
+ if (use_exclusive_lock_) {
+ mu_var->unlock();
+ }
+
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(T, Tindices) \
+ REGISTER_KERNEL_BUILDER(Name("SparseApplyMomentum") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<Tindices>("Tindices"), \
+ SparseApplyMomentumOp<T, Tindices>);
+
+REGISTER_KERNELS(float, int32);
+REGISTER_KERNELS(float, int64);
+REGISTER_KERNELS(double, int32);
+REGISTER_KERNELS(double, int64);
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyAdamOp : public OpKernel {
+ public:
+ explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ // all input refs share the same mutex
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor m = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor v = ctx->mutable_input(2, use_exclusive_lock_);
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, m.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, v.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(2)));
+
+ const Tensor& beta1_power = ctx->input(3);
+ const Tensor& beta2_power = ctx->input(4);
+ const Tensor& lr = ctx->input(5);
+ const Tensor& beta1 = ctx->input(6);
+ const Tensor& beta2 = ctx->input(7);
+ const Tensor& epsilon = ctx->input(8);
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+ errors::InvalidArgument("beta1_power is not a scalar: ",
+ beta1_power.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()),
+ errors::InvalidArgument("beta2_power is not a scalar: ",
+ beta2_power.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+ errors::InvalidArgument("beta1 is not a scalar: ",
+ beta1.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+ errors::InvalidArgument("beta2 is not a scalar: ",
+ beta2.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+ errors::InvalidArgument("epsilon is not a scalar: ",
+ epsilon.shape().DebugString()));
+
+ const Tensor& grad = ctx->input(9);
+ OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+ errors::InvalidArgument("var and m do not have the same shape",
+ var.shape().DebugString(), " ",
+ m.shape().DebugString()));
+ OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+ errors::InvalidArgument("var and v do not have the same shape",
+ var.shape().DebugString(), " ",
+ v.shape().DebugString()));
+ OP_REQUIRES(
+ ctx, var.shape().IsSameSize(grad.shape()),
+ errors::InvalidArgument("var and grad do not have the same shape",
+ var.shape().DebugString(), " ",
+ grad.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor m = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor v = ctx->mutable_input(2, use_exclusive_lock_);
+ const Tensor& beta1_power = ctx->input(3);
+ const Tensor& beta2_power = ctx->input(4);
+ const Tensor& lr = ctx->input(5);
+ const Tensor& beta1 = ctx->input(6);
+ const Tensor& beta2 = ctx->input(7);
+ const Tensor& epsilon = ctx->input(8);
+ const Tensor& grad = ctx->input(9);
+
+ functor::ApplyAdam<Device, T>()(device, var.flat<T>(), m.flat<T>(),
+ v.flat<T>(), beta1_power.scalar<T>(),
+ beta2_power.scalar<T>(), lr.scalar<T>(),
+ beta1.scalar<T>(), beta2.scalar<T>(),
+ epsilon.scalar<T>(), grad.flat<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyAdamOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyAdam<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+ typename TTypes<T>::ConstScalar beta1_power, \
+ typename TTypes<T>::ConstScalar beta2_power, \
+ typename TTypes<T>::ConstScalar lr, \
+ typename TTypes<T>::ConstScalar beta1, \
+ typename TTypes<T>::ConstScalar beta2, \
+ typename TTypes<T>::ConstScalar epsilon, \
+ typename TTypes<T>::ConstFlat grad); \
+ extern template struct ApplyAdam<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyRMSPropOp : public OpKernel {
+ public:
+ explicit ApplyRMSPropOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ if (use_exclusive_lock_) {
+ // all input refs share the same mutex
+ mutex_lock l1(*ctx->input_ref_mutex(0));
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ } else {
+ DoValidate(ctx);
+ if (!ctx->status().ok()) return;
+ DoCompute(ctx);
+ }
+ ctx->forward_ref_input_to_ref_output(0, 0);
+ }
+
+ private:
+ bool use_exclusive_lock_;
+
+ void DoValidate(OpKernelContext* ctx) {
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor ms = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor mom = ctx->mutable_input(2, use_exclusive_lock_);
+
+ OP_REQUIRES(
+ ctx, var.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(0)));
+ OP_REQUIRES(
+ ctx, ms.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(1)));
+ OP_REQUIRES(
+ ctx, mom.IsInitialized(),
+ errors::FailedPrecondition(
+ "Attempting to use uninitialized variables: ", def().input(2)));
+
+ const Tensor& lr = ctx->input(3);
+ const Tensor& rho = ctx->input(4);
+ const Tensor& momentum = ctx->input(5);
+ const Tensor& epsilon = ctx->input(6);
+ const Tensor& grad = ctx->input(7);
+
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+ errors::InvalidArgument("lr is not a scalar: ",
+ lr.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()),
+ errors::InvalidArgument("rho is not a scalar: ",
+ rho.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+ errors::InvalidArgument("momentum is not a scalar: ",
+ momentum.shape().DebugString()));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+ errors::InvalidArgument("epsilon is not a scalar: ",
+ epsilon.shape().DebugString()));
+
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(ms.shape()),
+        errors::InvalidArgument("var and ms do not have the same shape: ",
+                                var.shape().DebugString(), " ",
+                                ms.shape().DebugString()));
+
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(mom.shape()),
+        errors::InvalidArgument("var and mom do not have the same shape: ",
+                                var.shape().DebugString(), " ",
+                                mom.shape().DebugString()));
+
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape: ",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+ }
+
+ void DoCompute(OpKernelContext* ctx) {
+ const Device& device = ctx->template eigen_device<Device>();
+ Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+ Tensor ms = ctx->mutable_input(1, use_exclusive_lock_);
+ Tensor mom = ctx->mutable_input(2, use_exclusive_lock_);
+ const Tensor& lr = ctx->input(3);
+ const Tensor& rho = ctx->input(4);
+ const Tensor& momentum = ctx->input(5);
+ const Tensor& epsilon = ctx->input(6);
+ const Tensor& grad = ctx->input(7);
+
+ functor::ApplyRMSProp<Device, T>()(device, var.flat<T>(), ms.flat<T>(),
+ mom.flat<T>(), lr.scalar<T>(),
+ rho.scalar<T>(), momentum.scalar<T>(),
+ epsilon.scalar<T>(), grad.flat<T>());
+ }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ ApplyRMSPropOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void ApplyRMSProp<GPUDevice, T>::operator()( \
+ const GPUDevice& d, typename TTypes<T>::Flat var, \
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom, \
+ typename TTypes<T>::ConstScalar lr, typename TTypes<T>::ConstScalar rho, \
+ typename TTypes<T>::ConstScalar momentum, \
+ typename TTypes<T>::ConstScalar epsilon, \
+ typename TTypes<T>::ConstFlat grad); \
+ extern template struct ApplyRMSProp<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
new file mode 100644
index 0000000000..71f6d0253d
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops.h
@@ -0,0 +1,65 @@
+#ifndef TENSORFLOW_KERNELS_TRAINING_OPS_H_
+#define TENSORFLOW_KERNELS_TRAINING_OPS_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Each training algorithm has an ApplyXYZ functor struct declared in
+// this header file. The functors are specialized for different devices
+// (CPUDevice in training_ops.cc or GPUDevice in training_ops_gpu.cc).
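+//
+// Each functor applies a single optimizer update in place. In terms of the
+// implementations in this change:
+//   ApplyGradientDescent: var -= alpha * delta
+//   ApplyAdagrad:         accum += grad^2;  var -= lr * grad / sqrt(accum)
+//   ApplyMomentum:        accum = accum * momentum + grad;  var -= lr * accum
+//
+// A kernel dispatches to a functor with the op's device and flattened
+// tensors, e.g. (mirroring ApplyAdamOp::DoCompute in training_ops.cc):
+//   functor::ApplyAdagrad<Device, T>()(ctx->template eigen_device<Device>(),
+//                                      var.flat<T>(), accum.flat<T>(),
+//                                      lr.scalar<T>(), grad.flat<T>());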
+
+template <typename Device, typename T>
+struct ApplyGradientDescent {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::ConstScalar alpha,
+ typename TTypes<T>::ConstFlat delta);
+};
+
+template <typename Device, typename T>
+struct ApplyAdagrad {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad);
+};
+
+template <typename Device, typename T>
+struct ApplyMomentum {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad,
+ typename TTypes<T>::ConstScalar momentum);
+};
+
+template <typename Device, typename T>
+struct ApplyAdam {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+ typename TTypes<T>::ConstScalar beta1_power,
+ typename TTypes<T>::ConstScalar beta2_power,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar beta1,
+ typename TTypes<T>::ConstScalar beta2,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad);
+};
+
+template <typename Device, typename T>
+struct ApplyRMSProp {
+ void operator()(const Device& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar rho,
+ typename TTypes<T>::ConstScalar momentum,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad);
+};
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_TRAINING_OPS_H_
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
new file mode 100644
index 0000000000..3106f29648
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -0,0 +1,127 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/training_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
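+// In the GPU specializations below, the scalar hyperparameters are reshaped
+// to size-1 tensors and broadcast to the gradient's length so that each
+// update stays a single element-wise Eigen expression evaluated on the
+// device.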
+template <typename T>
+struct ApplyGradientDescent<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::ConstScalar alpha,
+ typename TTypes<T>::ConstFlat delta) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = delta.dimension(0);
+ Eigen::Sizes<1> single;
+ var.device(d) -= alpha.reshape(single).broadcast(bcast) * delta;
+ }
+};
+
+template <typename T>
+struct ApplyAdagrad<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad) {
+ accum.device(d) += grad.square();
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ var.device(d) -= lr.reshape(single).broadcast(bcast) * grad * accum.rsqrt();
+ }
+};
+
+template <typename T>
+struct ApplyMomentum<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat accum,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstFlat grad,
+ typename TTypes<T>::ConstScalar momentum) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ accum.device(d) = accum * momentum.reshape(single).broadcast(bcast) + grad;
+ var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+ }
+};
+
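+// ApplyAdam evaluates the Adam update implemented below:
+//   m   <- m + (1 - beta1) * (grad - m)
+//   v   <- v + (1 - beta2) * (grad^2 - v)
+//   var <- var - lr * sqrt(1 - beta2_power) / (1 - beta1_power)
+//                   * m / (epsilon + sqrt(v))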
+template <typename T>
+struct ApplyAdam<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+ typename TTypes<T>::ConstScalar beta1_power,
+ typename TTypes<T>::ConstScalar beta2_power,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar beta1,
+ typename TTypes<T>::ConstScalar beta2,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ const auto one = static_cast<T>(1.0);
+ m.device(d) =
+ m +
+ (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+ (grad - m);
+ v.device(d) =
+ v +
+ (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+ (grad.square() - v);
+ var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+ (beta1_power.constant(one) - beta1_power))
+ .reshape(single)
+ .broadcast(bcast) *
+ m / (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
+ }
+};
+
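+// ApplyRMSProp evaluates the RMSProp update implemented below:
+//   ms  <- ms + (1 - rho) * (grad^2 - ms)
+//   mom <- mom * momentum + lr * grad / sqrt(epsilon + ms)
+//   var <- var - mom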
+template <typename T>
+struct ApplyRMSProp<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+ typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom,
+ typename TTypes<T>::ConstScalar lr,
+ typename TTypes<T>::ConstScalar rho,
+ typename TTypes<T>::ConstScalar momentum,
+ typename TTypes<T>::ConstScalar epsilon,
+ typename TTypes<T>::ConstFlat grad) {
+ Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+ bcast[0] = grad.dimension(0);
+ Eigen::Sizes<1> single;
+ const auto one = static_cast<T>(1.0);
+ ms.device(d) = ms +
+ (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
+ (grad.square() - ms);
+ mom.device(d) =
+ mom * momentum.reshape(single).broadcast(bcast) +
+ lr.reshape(single).broadcast(bcast) * grad /
+ ((epsilon.reshape(single).broadcast(bcast) + ms).sqrt());
+ var.device(d) -= mom;
+ }
+};
+
+} // namespace functor
+
+template struct functor::ApplyGradientDescent<GPUDevice, float>;
+template struct functor::ApplyGradientDescent<GPUDevice, double>;
+
+template struct functor::ApplyAdagrad<GPUDevice, float>;
+template struct functor::ApplyAdagrad<GPUDevice, double>;
+
+template struct functor::ApplyMomentum<GPUDevice, float>;
+template struct functor::ApplyMomentum<GPUDevice, double>;
+
+template struct functor::ApplyAdam<GPUDevice, float>;
+template struct functor::ApplyAdam<GPUDevice, double>;
+
+template struct functor::ApplyRMSProp<GPUDevice, float>;
+template struct functor::ApplyRMSProp<GPUDevice, double>;
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
new file mode 100644
index 0000000000..3c629badb6
--- /dev/null
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -0,0 +1,226 @@
+#include <gtest/gtest.h>
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace tensorflow {
+
+// We focus on the single-threaded performance of the training ops.
+static SessionOptions InitSingleThreadedOptions() {
+ SessionOptions opts;
+ opts.config.set_intra_op_parallelism_threads(1);
+ opts.config.set_inter_op_parallelism_threads(1);
+ return opts;
+}
+
+static SessionOptions* GetOptions() {
+ static SessionOptions opts = InitSingleThreadedOptions();
+ return &opts;
+}
+
+static Node* Var(Graph* g, int n) {
+ return test::graph::Var(g, DT_FLOAT, TensorShape({n}));
+}
+
+static Node* Zeros(Graph* g, int n) {
+ Tensor data(DT_FLOAT, TensorShape({n}));
+ data.flat<float>().setZero();
+ return test::graph::Constant(g, data);
+}
+
+static Node* Random(Graph* g, int n) {
+ Tensor data(DT_FLOAT, TensorShape({n}));
+ data.flat<float>().setRandom();
+ return test::graph::Constant(g, data);
+}
+
+static Node* Scalar(Graph* g, float val) {
+ Tensor data(DT_FLOAT, TensorShape({}));
+ data.flat<float>()(0) = val;
+ return test::graph::Constant(g, data);
+}
+
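+// Each optimizer below gets two graphs: an init graph that assigns zeros to
+// the variables, and a train graph that applies one optimizer step with a
+// random gradient. The init graph is passed to test::Benchmark so the
+// variables are initialized before the train graph is timed.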
+static void SGD(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ test::graph::Assign(g, var, Zeros(g, n));
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyGradientDescent", {var, lr, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_SGD(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ SGD(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_SGD)->Arg(128 << 10)->Arg(256 << 10);
+
+static void Adagrad(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, accum, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyAdagrad", {var, accum, lr, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_Adagrad(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ Adagrad(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_Adagrad)->Arg(128 << 10)->Arg(256 << 10);
+
+static void Momentum(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ TensorShape shape({n});
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, accum, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto accum = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto grad = Random(g, n);
+ auto mom = Scalar(g, 0.01);
+ test::graph::Multi(g, "ApplyMomentum", {var, accum, lr, grad, mom});
+ *train_g = g;
+ }
+}
+
+static void BM_Momentum(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ Momentum(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_Momentum)->Arg(128 << 10)->Arg(256 << 10);
+
+static void Adam(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ TensorShape shape({n});
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto m = Var(g, n);
+ auto v = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, m, zero);
+ test::graph::Assign(g, v, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto m = Var(g, n);
+ auto v = Var(g, n);
+ auto beta1_power = Scalar(g, 0.9);
+ auto beta2_power = Scalar(g, 0.99);
+ auto lr = Scalar(g, 0.01);
+ auto beta1 = Scalar(g, 0.9);
+ auto beta2 = Scalar(g, 0.99);
+ auto epsilon = Scalar(g, 1e-8);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr,
+ beta1, beta2, epsilon, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_Adam(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ Adam(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_Adam)->Arg(128 << 10)->Arg(256 << 10);
+
+static void RMSProp(int32 n, Graph** init_g, Graph** train_g) {
+ RequireDefaultOps();
+ TensorShape shape({n});
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto ms = Var(g, n);
+ auto mom = Var(g, n);
+ auto zero = Zeros(g, n);
+ test::graph::Assign(g, var, zero);
+ test::graph::Assign(g, ms, zero);
+ test::graph::Assign(g, mom, zero);
+ *init_g = g;
+ }
+ {
+ Graph* g = new Graph(OpRegistry::Global());
+ auto var = Var(g, n);
+ auto ms = Var(g, n);
+ auto mom = Var(g, n);
+ auto lr = Scalar(g, 0.01);
+ auto rho = Scalar(g, 0.9);
+ auto momentum = Scalar(g, 0.9);
+ auto epsilon = Scalar(g, 1e-8);
+ auto grad = Random(g, n);
+ test::graph::Multi(g, "ApplyRMSProp",
+ {var, ms, mom, lr, rho, momentum, epsilon, grad});
+ *train_g = g;
+ }
+}
+
+static void BM_RMSProp(int iters, int params) {
+ const int64 tot = static_cast<int64>(iters) * params;
+ testing::ItemsProcessed(tot);
+ testing::BytesProcessed(tot * sizeof(float));
+ Graph* init;
+ Graph* train;
+ RMSProp(params, &init, &train);
+ test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_RMSProp)->Arg(128 << 10)->Arg(256 << 10);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
new file mode 100644
index 0000000000..4f11a881f8
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -0,0 +1,190 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/core/kernels/transpose_op_functor.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// inv = InvertPermutationOp(T<int32> p) takes a permutation of
+// integers 0, 1, ..., n - 1 and returns the inverted
+// permutation of p. I.e., inv[p[i]] == i, for i in [0 .. n).
+//
+// REQUIRES: input is a vector of int32.
+// REQUIRES: input is a permutation of 0, 1, ..., n-1.
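+//
+// For example, p = [2, 0, 1] yields inv = [1, 2, 0].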
+
+class InvertPermutationOp : public OpKernel {
+ public:
+ explicit InvertPermutationOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("invert_permutation expects a 1D vector."));
+ auto Tin = input.vec<int32>();
+ const int N = Tin.size();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ auto Tout = output->vec<int32>();
+ std::fill_n(Tout.data(), N, -1);
+ for (int i = 0; i < N; ++i) {
+ const int32 d = Tin(i);
+ OP_REQUIRES(context, 0 <= d && d < N,
+ errors::InvalidArgument(d, " is not between 0 and ", N));
+ OP_REQUIRES(context, Tout(d) == -1,
+ errors::InvalidArgument(d, " is duplicated in the input."));
+ Tout(d) = i;
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation").Device(DEVICE_CPU),
+ InvertPermutationOp);
+
+// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
+// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
+// shuffles the dimensions of the input tensor according to permutation.
+//
+// Specifically, the returned tensor output meets the following condition:
+// 1) output.dims() == input.dims();
+// 2) output.dim_size(i) == input.dim_size(perm[i]);
+// 3) output.tensor<T, N>(i_0, i_1, ..., i_N-1) ==
+// input.tensor<T, N>(j_0, j_1, ..., j_N-1),
+// where i_s == j_{perm[s]}
+//
+// REQUIRES: perm is a vector of int32.
+// REQUIRES: input.dims() == perm.size().
+// REQUIRES: perm is a permutation.
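+//
+// For example, transposing an input of shape [2, 3, 4] with perm = [2, 0, 1]
+// produces an output of shape [4, 2, 3] where
+//   output(i, j, k) == input(j, k, i).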
+
+template <typename Device, typename T>
+TransposeOp<Device, T>::TransposeOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+template <typename Device, typename T>
+void TransposeOp<Device, T>::Compute(OpKernelContext* context) {
+ const Tensor& input = context->input(0);
+ const Tensor& perm = context->input(1);
+ // Preliminary validation of sizes.
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(perm.shape()),
+ errors::InvalidArgument("perm must be a vector, not ",
+ perm.shape().DebugString()));
+ auto Vperm = perm.vec<int32>();
+ const int dims = input.dims();
+ static const int kMinDims = 1;
+ static const int kMaxDims = 8;
+ OP_REQUIRES(context, kMinDims <= dims && dims <= kMaxDims,
+ errors::Unimplemented("Transposing a tensor of rank ", dims,
+ " is not implemented."));
+ OP_REQUIRES(context, dims == Vperm.size(),
+ errors::InvalidArgument(
+ "transpose expects a vector of size ", input.dims(),
+ ". But input(1) is a vector of size ", Vperm.size()));
+ gtl::ArraySlice<int32> permutation(
+ reinterpret_cast<const int32*>(Vperm.data()), dims);
+ TensorShape shape;
+
+ // Check whether permutation is a permutation of integers of [0 .. dims).
+ gtl::InlinedVector<bool, 8> bits(dims);
+ for (const int32 d : permutation) {
+ OP_REQUIRES(
+ context, 0 <= d && d < dims,
+ errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")"));
+ bits[d] = true;
+ shape.AddDim(input.dim_size(d));
+ }
+ for (int i = 0; i < dims; ++i) {
+ OP_REQUIRES(context, bits[i], errors::InvalidArgument(
+ i, " is missing from {",
+ str_util::Join(permutation, ","), "}."));
+ }
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output));
+ switch (dims) {
+#define EXPAND_DIM(N) \
+ case N: { \
+ functor::TransposeFunctor<Device, T, N> func; \
+ func(context->eigen_device<Device>(), output->tensor<T, N>(), \
+ input.tensor<T, N>(), permutation.data()); \
+ break; \
+ }
+ EXPAND_DIM(1);
+ EXPAND_DIM(2);
+ EXPAND_DIM(3);
+ EXPAND_DIM(4);
+ EXPAND_DIM(5);
+ EXPAND_DIM(6);
+ EXPAND_DIM(7);
+ EXPAND_DIM(8);
+ default:
+ LOG(FATAL) << "Unexpected dims: " << dims;
+ }
+#undef EXPAND_DIM
+}
+
+namespace functor {
+
+template <typename Device, typename T, int NDIMS>
+void TransposeMaybeInline(const Device& d,
+ typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in,
+ const int* perm) {
+ // perm[] is a permutation of 0, 1, ..., NDIMS-1. perm[] is on CPU.
+ Eigen::array<int, NDIMS> p;
+ for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
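+  // For outputs below the 128 KiB threshold the shuffle is evaluated inline
+  // on the calling thread; larger outputs go through the device expression so
+  // the thread pool can parallelize the copy.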
+ if (out.size() * sizeof(T) < 131072) { // Small transpose on a CPU: do inline
+ out = in.shuffle(p);
+ } else {
+ out.device(d) = in.shuffle(p);
+ }
+}
+
+template <typename T, int NDIMS>
+struct TransposeFunctor<CPUDevice, T, NDIMS> {
+ void operator()(const CPUDevice& d, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) {
+ TransposeMaybeInline<CPUDevice, T, NDIMS>(d, out, in, perm);
+ }
+};
+
+} // namespace functor
+
+#define REGISTER(D, T) \
+ template class TransposeOp<D##Device, T>; \
+ REGISTER_KERNEL_BUILDER(Name("Transpose") \
+ .Device(DEVICE_##D) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("perm"), \
+ TransposeOp<D##Device, T>)
+REGISTER(CPU, float);
+REGISTER(CPU, double);
+REGISTER(CPU, complex64);
+REGISTER(CPU, uint8);
+REGISTER(CPU, int8);
+REGISTER(CPU, int16);
+REGISTER(CPU, int32);
+REGISTER(CPU, int64);
+REGISTER(CPU, string);
+#if GOOGLE_CUDA
+REGISTER(GPU, uint8);
+REGISTER(GPU, int8);
+REGISTER(GPU, int16);
+REGISTER(GPU, int32);
+REGISTER(GPU, int64);
+REGISTER(GPU, float);
+REGISTER(GPU, double);
+#endif
+#undef REGISTER
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
new file mode 100644
index 0000000000..f7a5be5c2b
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -0,0 +1,19 @@
+#ifndef TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
+#define TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+template <typename Device, typename T>
+class TransposeOp : public OpKernel {
+ public:
+ explicit TransposeOp(OpKernelConstruction* context);
+ void Compute(OpKernelContext* context) override;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
diff --git a/tensorflow/core/kernels/transpose_op_functor.h b/tensorflow/core/kernels/transpose_op_functor.h
new file mode 100644
index 0000000000..8cbd1cbb29
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op_functor.h
@@ -0,0 +1,28 @@
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, int NDIMS>
+void Transpose(const Device& d, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) {
+ // perm[] is a permutation of 0, 1, ..., NDIMS-1. perm[] is on CPU.
+ Eigen::array<int, NDIMS> p;
+ for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
+ out.device(d) = in.shuffle(p);
+}
+
+template <typename Device, typename T, int NDIMS>
+struct TransposeFunctor {
+ void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/transpose_op_gpu.cu.cc b/tensorflow/core/kernels/transpose_op_gpu.cu.cc
new file mode 100644
index 0000000000..8c04a6544e
--- /dev/null
+++ b/tensorflow/core/kernels/transpose_op_gpu.cu.cc
@@ -0,0 +1,43 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/kernels/transpose_op_functor.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename T, int NDIMS>
+struct TransposeFunctor<Eigen::GpuDevice, T, NDIMS> {
+ void operator()(const Eigen::GpuDevice& d,
+ typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) {
+ Transpose<Eigen::GpuDevice, T, NDIMS>(d, out, in, perm);
+ }
+};
+
+#define DEFINE(T, N) template struct TransposeFunctor<Eigen::GpuDevice, T, N>;
+#define DEFINE_DIM(T) \
+ DEFINE(T, 1); \
+ DEFINE(T, 2); \
+ DEFINE(T, 3); \
+ DEFINE(T, 4); \
+ DEFINE(T, 5); \
+ DEFINE(T, 6); \
+ DEFINE(T, 7); \
+ DEFINE(T, 8);
+DEFINE_DIM(uint8);
+DEFINE_DIM(int8);
+DEFINE_DIM(int16);
+DEFINE_DIM(int32);
+DEFINE_DIM(int64);
+DEFINE_DIM(float);
+DEFINE_DIM(double);
+#undef DEFINE_DIM
+#undef DEFINE
+
+} // end namespace functor
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
new file mode 100644
index 0000000000..61f4a54583
--- /dev/null
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -0,0 +1,61 @@
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
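+// Unique returns the distinct elements of a 1-D tensor in order of first
+// occurrence, together with an index vector mapping every input element to
+// its position in the output. For example, input [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// yields output [1, 2, 4, 7, 8] and idx [0, 0, 1, 2, 2, 2, 3, 4, 4].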
+template <typename T>
+class UniqueOp : public OpKernel {
+ public:
+ explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) {
+ const DataType dt = DataTypeToEnum<T>::v();
+ OP_REQUIRES_OK(context, context->MatchSignature({dt}, {dt, DT_INT32}));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("unique expects a 1D vector."));
+ auto Tin = input.vec<T>();
+ const int N = Tin.size();
+
+ Tensor* idx = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(1, input.shape(), &idx));
+ auto idx_vec = idx->template vec<int32>();
+
+ std::unordered_map<T, int32> uniq;
+ uniq.reserve(2 * N);
+ for (int i = 0, j = 0; i < N; ++i) {
+ auto it = uniq.insert(std::make_pair(Tin(i), j));
+ idx_vec(i) = it.first->second;
+ if (it.second) {
+ ++j;
+ }
+ }
+ int32 uniq_size = uniq.size();
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 0, TensorShape({uniq_size}), &output));
+ auto output_vec = output->template vec<T>();
+
+ for (auto it : uniq) {
+ output_vec(it.second) = it.first;
+ }
+ }
+};
+
+#define REGISTER_UNIQUE(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Unique").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ UniqueOp<type>)
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
+#undef REGISTER_UNIQUE
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc
new file mode 100644
index 0000000000..658f2282cf
--- /dev/null
+++ b/tensorflow/core/kernels/unique_op_test.cc
@@ -0,0 +1,51 @@
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/tensor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+
+namespace {
+
+static void BM_Unique(int iters, int dim) {
+ testing::StopTiming();
+ RequireDefaultOps();
+ Graph* g = new Graph(OpRegistry::Global());
+
+ Tensor input(DT_INT32, TensorShape({dim}));
+ input.flat<int32>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Unique")
+ .Input(test::graph::Constant(g, input))
+ .Attr("T", DT_INT32)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(int32));
+ testing::UseRealTime();
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+}
+
+BENCHMARK(BM_Unique)
+ ->Arg(32)
+ ->Arg(256)
+ ->Arg(1024)
+ ->Arg(4 * 1024)
+ ->Arg(16 * 1024)
+ ->Arg(64 * 1024)
+ ->Arg(256 * 1024);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
new file mode 100644
index 0000000000..36cfb2c8e5
--- /dev/null
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -0,0 +1,96 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/split_op.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
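+// Unpack splits a tensor along its first dimension into `num` outputs; a
+// [3, 4, 5] input, for instance, produces three [4, 5] outputs. When the
+// per-output slices are suitably aligned (or empty) the outputs share the
+// input buffer; otherwise the data is copied through the Split functor.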
+template <typename Device, typename T>
+class UnpackOp : public OpKernel {
+ public:
+ explicit UnpackOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+ void Compute(OpKernelContext* context) override {
+ const int32 num = num_outputs();
+ const Tensor& input = context->input(0);
+ const TensorShape& input_shape = input.shape();
+
+ OP_REQUIRES(
+ context, input_shape.dims() > 0 && input_shape.dim_size(0) == num,
+ errors::InvalidArgument("Input shape must start with ", num, ", got ",
+ input_shape.ShortDebugString()));
+
+ auto output_shape = input_shape;
+ output_shape.RemoveDim(0);
+ const int32 output_size = output_shape.num_elements();
+
+ // Special case: Aligned, so we can share the underlying buffer.
+ //
+    // Apply this optimization conservatively: if the input is aligned,
+    // the resulting tensors must be aligned. It's conservative because
+    // if the immediate consumers of the resulting tensors are not using
+    // Eigen for computation, it's perfectly fine to avoid the copy.
+ if (output_size == 0 || IsInnerDimsSizeAligned<T>(input_shape)) {
+ for (int i = 0; i < num; ++i) {
+ Tensor output;
+ CHECK(output.CopyFrom(input.Slice(i, i + 1), output_shape));
+ context->set_output(i, output);
+ }
+ return;
+ }
+
+ // Except for shape, unpack is a special case of split, so we reuse the
+ // same computational kernels.
+ auto input_reshaped = input.shaped<T, 3>({1, num, output_size});
+
+ for (int i = 0; i < num; ++i) {
+ Tensor* output;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(i, output_shape, &output));
+ auto output_shaped = output->shaped<T, 3>({1, 1, output_size});
+
+ Eigen::DSizes<ptrdiff_t, 3> indices{0, i, 0};
+ Eigen::DSizes<ptrdiff_t, 3> sizes{1, 1, output_size};
+ functor::Split<Device, T>()(context->eigen_device<Device>(),
+ output_shaped, input_reshaped, indices,
+ sizes);
+ }
+ }
+};
+
+#define REGISTER_UNPACK(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Unpack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ UnpackOp<CPUDevice, type>)
+
+TF_CALL_ALL_TYPES(REGISTER_UNPACK);
+
+#undef REGISTER_UNPACK
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Unpack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+ UnpackOp<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+#undef REGISTER_GPU
+
+#endif // GOOGLE_CUDA
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
new file mode 100644
index 0000000000..2f1dbc68c0
--- /dev/null
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -0,0 +1,37 @@
+#define EIGEN_USE_THREADS
+#include "tensorflow/core/kernels/variable_ops.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("Variable").Device(DEVICE_CPU), VariableOp);
+REGISTER_KERNEL_BUILDER(Name("TemporaryVariable").Device(DEVICE_CPU),
+ TemporaryVariableOp);
+REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable").Device(DEVICE_CPU),
+ DestroyTemporaryVariableOp);
+
+#if GOOGLE_CUDA
+// Only register 'Variable' on GPU for the subset of types also supported by
+// 'Assign' (see dense_update_ops.cc.)
+#define REGISTER_GPU_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Variable").Device(DEVICE_GPU).TypeConstraint<type>("dtype"), \
+ VariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("dtype"), \
+ TemporaryVariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T"), \
+ DestroyTemporaryVariableOp);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
new file mode 100644
index 0000000000..77d2da0ad4
--- /dev/null
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -0,0 +1,146 @@
+#ifndef TENSORFLOW_KERNELS_VARIABLE_OPS_H_
+#define TENSORFLOW_KERNELS_VARIABLE_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
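+// VariableOp lazily creates (or looks up) its Var resource in the resource
+// manager on first execution and then outputs a reference to the same
+// underlying tensor on every run, so ops such as Assign can mutate it in
+// place.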
+class VariableOp : public OpKernel {
+ public:
+ explicit VariableOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+ dtype_ = RemoveRefType(context->output_type(0));
+ }
+
+ ~VariableOp() override {
+ if (var_) var_->Unref();
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ mutex_lock l(init_mu_);
+ if (var_ == nullptr) {
+ OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+ true /* use name() */));
+ auto creator = [this](Var** var) {
+ *var = new Var(dtype_);
+ (*var)->tensor()->set_shape(shape_);
+ return Status::OK();
+ };
+ OP_REQUIRES_OK(ctx,
+ cinfo_.resource_manager()->LookupOrCreate<Var>(
+ cinfo_.container(), cinfo_.name(), &var_, creator));
+ }
+ // Output a reference to our tensor, so it may be updated.
+ //
+ // As long as *this is alive, the ref we return here is valid
+ // because *this owns a ref on var_.
+ ctx->set_output_ref(0, var_->mu(), var_->tensor());
+ }
+
+ private:
+ class Var : public ResourceBase {
+ public:
+ explicit Var(DataType dtype) : tensor_(dtype) {}
+ mutex* mu() { return &mu_; }
+ Tensor* tensor() { return &tensor_; }
+
+ string DebugString() override {
+ return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
+ tensor_.shape().ShortDebugString());
+ }
+
+ private:
+ mutex mu_;
+ Tensor tensor_;
+
+ ~Var() override {}
+ TF_DISALLOW_COPY_AND_ASSIGN(Var);
+ };
+
+ DataType dtype_;
+ TensorShape shape_;
+
+ mutex init_mu_;
+ ContainerInfo cinfo_ GUARDED_BY(init_mu_);
+ Var* var_ GUARDED_BY(init_mu_) = nullptr;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(VariableOp);
+};
+
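+// TemporaryVariableOp allocates a tensor, registers it in the per-step
+// resource manager under var_name, and outputs a reference to it. The
+// matching DestroyTemporaryVariableOp below forwards the final value as a
+// regular output and deletes the resource.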
+class TemporaryVariableOp : public OpKernel {
+ public:
+ explicit TemporaryVariableOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+ OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+ OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+ // Variable name defaults to op name if not specified explicitly.
+ if (var_name_ == "") var_name_ = name();
+ }
+
+ void Compute(OpKernelContext* context) override {
+ Status s;
+ ResourceMgr* rm = context->step_resource_manager();
+ OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+ auto* tmp_var = new TmpVar;
+ OP_REQUIRES(context, tmp_var,
+ errors::ResourceExhausted("Could not allocate TmpVar."));
+ tmp_var->name = var_name_;
+ s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
+ if (!s.ok()) tmp_var->Unref();
+ OP_REQUIRES_OK(context, s);
+ OP_REQUIRES_OK(context, rm->Create("tmp_var", var_name_, tmp_var));
+ context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+ }
+
+ private:
+ // Refcounted temporary variable resource.
+ friend class DestroyTemporaryVariableOp;
+ struct TmpVar : public ResourceBase {
+ mutex mu;
+ Tensor val;
+ string name;
+ string DebugString() override { return name; }
+ ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
+ };
+
+ TensorShape shape_;
+ DataType dtype_;
+ string var_name_;
+};
+
+class DestroyTemporaryVariableOp : public OpKernel {
+ public:
+ explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+    OP_REQUIRES(context, IsRefType(context->input_type(0)),
+                errors::InvalidArgument("lhs input needs to be a ref type"));
+ OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+ OP_REQUIRES(context, var_name_ != "",
+ errors::InvalidArgument("Missing var_name attribute"));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
+ // their execution before this DestroyTemporaryVariable op executes.
+ // This is typically achieved using control dependencies.
+ CHECK(IsRefType(context->input_dtype(0)));
+ Tensor tmpvar = context->mutable_input(0, false);
+ context->set_output(0, tmpvar);
+ ResourceMgr* rm = context->step_resource_manager();
+ OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+ OP_REQUIRES_OK(
+ context, rm->Delete<TemporaryVariableOp::TmpVar>("tmp_var", var_name_));
+ }
+
+ private:
+ string var_name_;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_VARIABLE_OPS_H_
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
new file mode 100644
index 0000000000..9db0943ea7
--- /dev/null
+++ b/tensorflow/core/kernels/where_op.cc
@@ -0,0 +1,74 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/where_op.h"
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
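+// Where returns the coordinates of the true elements of a bool tensor as an
+// int64 matrix of shape [num_true, input_rank]. The kernel first counts the
+// true elements (functor::NumTrue) to size the output, then writes one row of
+// indices per true element in row-major order (functor::Where). For example,
+// the 2x2 input [[true, false], [false, true]] produces [[0, 0], [1, 1]].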
+template <typename Device>
+class WhereOp : public OpKernel {
+ public:
+ explicit WhereOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input = context->input(0);
+
+ const int input_dims = input.dims();
+ Tensor num_true;
+ OP_REQUIRES_OK(
+ context, context->allocate_temp(DT_INT64, TensorShape({}), &num_true));
+ auto num_true_t = num_true.scalar<int64>();
+
+ functor::NumTrue<Device>::Compute(context->eigen_device<Device>(),
+ input.flat<bool>(), num_true_t);
+ TensorShape output_shape({num_true_t(), input_dims});
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+#define HANDLE_DIM(NDIM) \
+ case NDIM: \
+ functor::Where<Device, NDIM>::Compute(context->eigen_device<Device>(), \
+ input.tensor<bool, NDIM>(), \
+ output->matrix<int64>()); \
+ break;
+
+ switch (input_dims) {
+ HANDLE_DIM(1);
+ HANDLE_DIM(2);
+ HANDLE_DIM(3);
+ HANDLE_DIM(4);
+ HANDLE_DIM(5);
+
+ default:
+ OP_REQUIRES(context, false,
+ errors::InvalidArgument(
+ "WhereOp : Unhandled input dimensions: ", input_dims));
+ }
+#undef HANDLE_DIM
+ }
+
+ private:
+ TF_DISALLOW_COPY_AND_ASSIGN(WhereOp);
+};
+
+#define REGISTER_WHERE() \
+ REGISTER_KERNEL_BUILDER(Name("Where").Device(DEVICE_CPU), WhereOp<CPUDevice>);
+
+REGISTER_WHERE();
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
new file mode 100644
index 0000000000..c7b835d02f
--- /dev/null
+++ b/tensorflow/core/kernels/where_op.h
@@ -0,0 +1,65 @@
+#ifndef TENSORFLOW_KERNELS_WHERE_OP_H_
+#define TENSORFLOW_KERNELS_WHERE_OP_H_
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device>
+struct NumTrue {
+ EIGEN_ALWAYS_INLINE static void Compute(
+ const Device& d, typename TTypes<bool>::ConstFlat input,
+ TTypes<int64>::Scalar num_true) {
+ num_true.device(d) = input.template cast<int64>().sum();
+ }
+};
+
+template <typename Device, int NDIM>
+struct Where {
+ EIGEN_ALWAYS_INLINE static void Compute(
+ const Device& d, typename TTypes<bool, NDIM>::ConstTensor input,
+ typename TTypes<int64>::Matrix output) {
+ Eigen::DenseIndex true_n = 0;
+ Eigen::DSizes<Eigen::DenseIndex, NDIM> dims = input.dimensions();
+ Eigen::DSizes<Eigen::DenseIndex, NDIM> strides;
+
+ // Calculate strides for RowMajor order.
+ EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) ==
+ static_cast<int>(Eigen::RowMajor)),
+ INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR);
+
+ strides[NDIM - 1] = 1;
+ for (int i = NDIM - 2; i >= 0; --i) {
+ strides[i] = strides[i + 1] * dims[i + 1];
+ }
+
+ // Note, no bounds checking is done on true_n. It is assumed that
+ // the output was correctly sized via output of NumTrue::Compute.
+ for (Eigen::DenseIndex n = 0; n < input.size(); ++n) {
+ if (input.data()[n]) {
+ WriteIndexRowMajor(output, strides, true_n, n);
+ ++true_n;
+ }
+ }
+ }
+
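+  // Decomposes a row-major flat index into NDIM coordinates using the
+  // precomputed strides. For example, with dims [2, 3, 4] the strides are
+  // [12, 4, 1], so flat index 17 maps to the coordinate (1, 1, 1).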
+ EIGEN_ALWAYS_INLINE static void WriteIndexRowMajor(
+ typename TTypes<int64>::Matrix output,
+ const Eigen::DSizes<Eigen::DenseIndex, NDIM>& strides,
+ Eigen::DenseIndex true_n, Eigen::DenseIndex index) {
+ for (int i = 0; i < NDIM; ++i) {
+ output(true_n, i) = index / strides[i];
+ index %= strides[i];
+ }
+ }
+};
+
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_WHERE_OP_H_
diff --git a/tensorflow/core/kernels/whole_file_read_ops.cc b/tensorflow/core/kernels/whole_file_read_ops.cc
new file mode 100644
index 0000000000..b940163ec9
--- /dev/null
+++ b/tensorflow/core/kernels/whole_file_read_ops.cc
@@ -0,0 +1,108 @@
+// See docs in ../ops/io_ops.cc.
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/kernels/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/public/tensor_shape.h"
+
+namespace tensorflow {
+
+static Status ReadEntireFile(Env* env, const string& filename,
+ string* contents) {
+ uint64 file_size = 0;
+ TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
+ contents->resize(file_size);
+ RandomAccessFile* file;
+ TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
+ std::unique_ptr<RandomAccessFile> make_sure_file_gets_deleted(file);
+ StringPiece data;
+ TF_RETURN_IF_ERROR(file->Read(0, file_size, &data, &(*contents)[0]));
+ if (data.size() != file_size) {
+ return errors::DataLoss("Truncated read of '", filename, "' expected ",
+ file_size, " got ", data.size());
+ }
+ if (data.data() != &(*contents)[0]) {
+ memmove(&(*contents)[0], data.data(), data.size());
+ }
+ return Status::OK();
+}
+
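+// WholeFileReader produces exactly one (key, value) record per unit of work:
+// the key is the filename and the value is the entire contents of that file.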
+class WholeFileReader : public ReaderBase {
+ public:
+ WholeFileReader(Env* env, const string& node_name)
+ : ReaderBase(strings::StrCat("WholeFileReader '", node_name, "'")),
+ env_(env) {}
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ *key = current_work();
+ TF_RETURN_IF_ERROR(ReadEntireFile(env_, *key, value));
+ *produced = true;
+ *at_end = true;
+ return Status::OK();
+ }
+
+ // Stores state in a ReaderBaseState proto, since WholeFileReader has
+ // no additional state beyond ReaderBase.
+ Status SerializeStateLocked(string* state) override {
+ ReaderBaseState base_state;
+ SaveBaseState(&base_state);
+ base_state.SerializeToString(state);
+ return Status::OK();
+ }
+
+ Status RestoreStateLocked(const string& state) override {
+ ReaderBaseState base_state;
+ if (!ParseProtoUnlimited(&base_state, state)) {
+ return errors::InvalidArgument("Could not parse state for ", name(), ": ",
+ str_util::CEscape(state));
+ }
+ TF_RETURN_IF_ERROR(RestoreBaseState(base_state));
+ return Status::OK();
+ }
+
+ private:
+ Env* env_;
+};
+
+class WholeFileReaderOp : public ReaderOpKernel {
+ public:
+ explicit WholeFileReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ Env* env = context->env();
+ SetReaderFactory(
+ [this, env]() { return new WholeFileReader(env, name()); });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("WholeFileReader").Device(DEVICE_CPU),
+ WholeFileReaderOp);
+
+class ReadFileOp : public OpKernel {
+ public:
+ using OpKernel::OpKernel;
+ void Compute(OpKernelContext* context) override {
+ const Tensor* input;
+ OP_REQUIRES_OK(context, context->input("filename", &input));
+ OP_REQUIRES(context, TensorShapeUtils::IsScalar(input->shape()),
+ errors::InvalidArgument(
+ "Input filename tensor must be scalar, but had shape: ",
+ input->shape().DebugString()));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output("contents",
+ TensorShape({}), &output));
+ OP_REQUIRES_OK(context,
+ ReadEntireFile(context->env(), input->scalar<string>()(),
+ &output->scalar<string>()()));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ReadFile").Device(DEVICE_CPU), ReadFileOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
new file mode 100644
index 0000000000..ff54d157af
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -0,0 +1,90 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/kernels/xent_op.h"
+#include "tensorflow/core/public/tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class SoftmaxXentWithLogitsOp : public OpKernel {
+ public:
+ explicit SoftmaxXentWithLogitsOp(OpKernelConstruction* context)
+ : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& logits_in = context->input(0);
+ const Tensor& labels_in = context->input(1);
+ OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
+ errors::InvalidArgument(
+ "logits and labels must be same size: logits_size=",
+ logits_in.shape().DebugString(), " labels_size=",
+ labels_in.shape().DebugString()));
+ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
+ errors::InvalidArgument("logits must be 2-dimensional"));
+ // As we already tested that both inputs have the same shape no need to
+ // check that "labels" is a matrix too.
+
+ // loss is 1-D (one per example), and size is batch_size.
+
+ Tensor scratch;
+ OP_REQUIRES_OK(
+ context, context->allocate_temp(DataTypeToEnum<T>::value,
+ TensorShape({logits_in.dim_size(0), 1}),
+ &scratch));
+
+ Tensor* loss_out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(
+ 0, TensorShape({logits_in.dim_size(0)}), &loss_out));
+ Tensor* back_out = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(1, logits_in.shape(), &back_out));
+
+ functor::XentFunctor<Device, T> functor;
+ functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
+ labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
+ back_out->matrix<T>());
+ }
+};
+
+// Partial specialization for a CPUDevice, that uses the Eigen implementation
+// from XentEigenImpl.
+namespace functor {
+template <typename T>
+struct XentFunctor<CPUDevice, T> {
+ void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop) {
+ XentEigenImpl<CPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+ backprop);
+ }
+};
+} // namespace functor
+
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxXentWithLogitsOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ SoftmaxXentWithLogitsOp<CPUDevice, double>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ SoftmaxXentWithLogitsOp<GPUDevice, float>);
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/xent_op.h b/tensorflow/core/kernels/xent_op.h
new file mode 100644
index 0000000000..edb7d817c8
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op.h
@@ -0,0 +1,102 @@
+#ifndef TENSORFLOW_KERNELS_XENT_OP_H_
+#define TENSORFLOW_KERNELS_XENT_OP_H_
+// Functor definition for XentOp, must be compilable by nvcc.
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by XentOp to do the computations.
+template <typename Device, typename T>
+struct XentFunctor {
+ // Computes Cross Entropy loss and backprop.
+ //
+ // logits: batch_size, num_classes.
+ // labels: batch_size, num_classes.
+ // scratch: temporary tensor, dims: batch_size, 1
+ // loss: output tensor for the loss, dims: batch_size.
+ // backprop: output tensor for the backprop, dims: batch_size, num_classes.
+ void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop);
+};
+
+// Eigen code implementing XentFunctor::operator().
+// This code works for both CPU and GPU and is used by the functor
+// specializations for both device types.
+template <typename Device, typename T>
+struct XentEigenImpl {
+ static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop) {
+    // NOTE(mdevin): This duplicates some of the computations in softmax_op
+    // because we need the intermediate (logits - max(logits)) values to
+    // avoid a log(exp()) in the computation of the loss.
+
+ const int kBatchDim = 0;
+ const int kClassDim = 1;
+
+ const int batch_size = logits.dimension(kBatchDim);
+ const int num_classes = logits.dimension(kClassDim);
+
+// These arrays are used to reduce along the class dimension, and broadcast
+// the resulting value to all classes.
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::array<int, 1> along_class;
+ along_class[0] = kClassDim;
+ Eigen::array<int, 1> batch_only;
+ batch_only[0] = batch_size;
+ Eigen::array<int, 2> batch_by_one;
+ batch_by_one[0] = batch_size;
+ batch_by_one[1] = 1;
+ Eigen::array<int, 2> one_by_class;
+ one_by_class[0] = 1;
+ one_by_class[1] = num_classes;
+#else
+ Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
+ Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
+ batch_by_one.set(0, batch_size);
+ Eigen::IndexList<int> batch_only;
+ batch_only.set(0, batch_size);
+ Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
+ one_by_class.set(1, num_classes);
+#endif
+
+ // max_logits along classes.
+ scratch.reshape(batch_only).device(d) = logits.maximum(along_class);
+
+ // logits - max_logits.
+ backprop.device(d) = logits - scratch.broadcast(one_by_class);
+
+ // sum(exp(logits - max_logits)) along classes.
+ scratch.reshape(batch_only).device(d) = backprop.exp().sum(along_class);
+
+    // NOTE(keveman): Eigen on GPU dispatches to an optimized implementation
+    // for an expression of the form lhs = rhs.sum().
+    // lhs = -rhs.sum() doesn't match that pattern, so we fold the negation in
+    // before calling sum(). The loss computed below is
+    //   sum(-labels *
+    //       ((logits - max_logits) - log(sum(exp(logits - max_logits)))))
+    //   = sum(labels *
+    //         (log(sum(exp(logits - max_logits))) - (logits - max_logits)))
+    // along classes.
+ loss.device(d) =
+ (labels * (scratch.log().eval().broadcast(one_by_class) - backprop))
+ .eval()
+ .sum(along_class);
+
+ // backprop: prob - labels, where
+ // prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
+ backprop.device(d) =
+ (backprop.exp() / scratch.broadcast(one_by_class)) - labels;
+ }
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_XENT_OP_H_
diff --git a/tensorflow/core/kernels/xent_op_gpu.cu.cc b/tensorflow/core/kernels/xent_op_gpu.cu.cc
new file mode 100644
index 0000000000..eec6a84281
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op_gpu.cu.cc
@@ -0,0 +1,35 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/xent_op.h"
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization for a GPUDevice, that uses the Eigen implementation
+// from XentEigenImpl.
+namespace functor {
+template <typename T>
+struct XentFunctor<GPUDevice, T> {
+ void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+ typename TTypes<T>::ConstMatrix labels,
+ typename TTypes<T>::Matrix scratch,
+ typename TTypes<T>::Vec loss,
+ typename TTypes<T>::Matrix backprop) {
+ XentEigenImpl<GPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+ backprop);
+ }
+};
+} // end namespace functor
+
+// Instantiate the GPU implementation for float.
+template struct functor::XentFunctor<GPUDevice, float>;
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/xent_op_test.cc b/tensorflow/core/kernels/xent_op_test.cc
new file mode 100644
index 0000000000..9aab1b09bf
--- /dev/null
+++ b/tensorflow/core/kernels/xent_op_test.cc
@@ -0,0 +1,46 @@
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/kernels/xent_op.h"
+
+namespace tensorflow {
+
+static Graph* Xent(int batch_size, int num_classes) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor logits(DT_FLOAT, TensorShape({batch_size, num_classes}));
+ logits.flat<float>().setRandom();
+ Tensor labels(DT_FLOAT, TensorShape({batch_size, num_classes}));
+ labels.flat<float>().setRandom();
+ test::graph::Binary(g, "SoftmaxCrossEntropyWithLogits",
+ test::graph::Constant(g, logits),
+ test::graph::Constant(g, labels));
+ return g;
+}
+
+#define BM_XentDev(BATCH, CLASS, DEVICE) \
+ static void BM_Xent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
+ test::Benchmark(#DEVICE, Xent(BATCH, CLASS)).Run(iters); \
+ } \
+ BENCHMARK(BM_Xent##_##BATCH##_##CLASS##_##DEVICE);
+
+/// The representative tests for ptb_word on GPU
+BM_XentDev(16, 10000, gpu);
+BM_XentDev(16, 30000, gpu);
+BM_XentDev(16, 100000, gpu);
+
+BM_XentDev(32, 10000, gpu);
+BM_XentDev(32, 30000, gpu);
+BM_XentDev(32, 100000, gpu);
+
+BM_XentDev(64, 10000, gpu);
+BM_XentDev(64, 30000, gpu);
+BM_XentDev(64, 100000, gpu);
+
+/// Only the smaller tests for CPU. Otherwise, it's too slow
+BM_XentDev(16, 10000, cpu);
+BM_XentDev(32, 10000, cpu);
+BM_XentDev(64, 10000, cpu);
+
+} // end namespace tensorflow