author     2018-09-26 11:54:30 +0800
committer  2018-09-26 11:54:30 +0800
commit     35174f46b973c66a2e6894a12b3018d60e8414ec (patch)
tree       5bdae0172159bc02ec3a470722bf959b14dd47ba /tensorflow/core/kernels
parent     f0886f7269de900d226455d4831722f6fc94a71b (diff)
parent     6666516f390f125ed70ddbd4e6f89b83d953c408 (diff)
Merge remote-tracking branch 'origin'
Diffstat (limited to 'tensorflow/core/kernels')
105 files changed, 7212 insertions, 1394 deletions
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index c3c6013d83..1a3db2c7cd 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -30,6 +30,7 @@ load( "//tensorflow:tensorflow.bzl", "if_android", "tf_cc_test", + "tf_cc_test_mkl", "tf_cc_tests", "tf_cc_binary", "tf_copts", @@ -50,6 +51,10 @@ load( "tf_kernel_tests_linkstatic", ) load( + "//tensorflow/core:platform/default/build_config_root.bzl", + "tf_cuda_tests_tags", +) +load( "//third_party/mkl:build_defs.bzl", "if_mkl", "if_mkl_ml", @@ -212,6 +217,19 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "extract_volume_patches_op", + prefix = "extract_volume_patches_op", + deps = [ + ":bounds_check", + ":eigen_helpers", + ":ops_util", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//third_party/eigen3", + ], +) + cc_library( name = "conv_3d", hdrs = ["conv_3d.h"], @@ -617,6 +635,7 @@ cc_library( ":diag_op", ":edit_distance_op", ":extract_image_patches_op", + ":extract_volume_patches_op", ":gather_nd_op", ":gather_op", ":guarantee_const_op", @@ -636,6 +655,7 @@ cc_library( ":reshape_op", ":reverse_op", ":reverse_sequence_op", + ":searchsorted_op", ":shape_ops", ":slice_op", ":snapshot_op", @@ -869,6 +889,12 @@ tf_kernel_library( ) tf_kernel_library( + name = "searchsorted_op", + prefix = "searchsorted_op", + deps = ARRAY_DEPS, +) + +tf_kernel_library( name = "inplace_ops", prefix = "inplace_ops", deps = ARRAY_DEPS, @@ -1105,7 +1131,7 @@ tf_cuda_cc_test( name = "depthwise_conv_ops_test", size = "small", srcs = ["depthwise_conv_ops_test.cc"], - tags = ["requires-gpu-sm35"], + tags = tf_cuda_tests_tags(), deps = [ ":conv_ops", ":image", @@ -2702,6 +2728,7 @@ cc_library( ) LOGGING_DEPS = [ + "@com_google_absl//absl/strings", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", @@ -2759,6 +2786,7 @@ tf_cc_tests( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "@com_google_absl//absl/strings", ], ) @@ -4396,6 +4424,7 @@ cc_library( ":reduce_join_op", ":regex_full_match_op", ":regex_replace_op", + ":string_format_op", ":string_join_op", ":string_length_op", ":string_split_op", @@ -4405,8 +4434,16 @@ cc_library( ], ) +cc_library( + name = "string_util", + srcs = ["string_util.cc"], + hdrs = ["string_util.h"], + deps = ["//tensorflow/core:lib"], +) + STRING_DEPS = [ ":bounds_check", + ":string_util", "//third_party/eigen3", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -4427,6 +4464,30 @@ tf_kernel_library( ) tf_kernel_library( + name = "string_format_op", + prefix = "string_format_op", + deps = STRING_DEPS + ["@com_google_absl//absl/strings"], +) + +tf_cc_test( + name = "string_format_op_test", + size = "small", + srcs = ["string_format_op_test.cc"], + deps = [ + ":string_format_op", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:ops_util", + ], +) + +tf_kernel_library( name = "string_join_op", prefix = "string_join_op", deps = STRING_DEPS, @@ -4504,6 +4565,25 @@ tf_kernel_library( deps = STRING_DEPS, ) +tf_cc_test( + name = "substr_op_test", + size = "small", + srcs = ["substr_op_test.cc"], + deps = [ + ":substr_op", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + 
"//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:ops_util", + ], +) + tf_kernel_library( name = "as_string_op", prefix = "as_string_op", @@ -5094,6 +5174,7 @@ filegroup( "spacetobatch_functor.h", "spacetodepth_op.h", "spectrogram.h", + "string_util.h", "tensor_array.h", "tile_functor.h", "tile_ops_cpu_impl.h", @@ -5262,6 +5343,7 @@ filegroup( "spectrogram_op.cc", "stack_ops.cc", "string_join_op.cc", + "string_util.cc", "summary_op.cc", "tensor_array.cc", "tensor_array_ops.cc", @@ -6209,6 +6291,26 @@ tf_mkl_kernel_library( ] + mkl_deps(), ) +tf_cc_test_mkl( + name = "mkl_conv_ops_test", + size = "small", + srcs = ["mkl_conv_ops_test.cc"], + deps = [ + ":ops_testutil", + ":ops_util", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_mkl_kernel_library( name = "mkl_tfconv_op", prefix = "mkl_tfconv", diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc index 7b28c8e91f..e15ea82e7d 100644 --- a/tensorflow/core/kernels/bias_op.cc +++ b/tensorflow/core/kernels/bias_op.cc @@ -134,8 +134,8 @@ class BiasOp : public BinaryOp<T> { if (data_format_ == FORMAT_NCHW) { int32 batch, height, width, channel; GetBiasValueDims(input, data_format_, &batch, &height, &width, &channel); - Eigen::DSizes<int32, 4> four_dims(1, channel, 1, 1); - Eigen::DSizes<int32, 4> broad_cast_dims(batch, 1, height, width); + Eigen::DSizes<Eigen::Index, 4> four_dims(1, channel, 1, 1); + Eigen::DSizes<Eigen::Index, 4> broad_cast_dims(batch, 1, height, width); const Device& d = context->eigen_device<Device>(); output->tensor<T, 4>().device(d) = input.tensor<T, 4>() + @@ -247,14 +247,14 @@ class BiasGradOp : public OpKernel { OP_REQUIRES(context, output_backprop.dims() == 4, errors::InvalidArgument( "NCHW format supports only 4D input/output tensor.")); - Eigen::DSizes<int, 4> four_dims(batch, channel, height, width); + Eigen::DSizes<Eigen::Index, 4> four_dims(batch, channel, height, width); #ifdef EIGEN_HAS_INDEX_LIST using idx0 = Eigen::type2index<0>; using idx2 = Eigen::type2index<2>; using idx3 = Eigen::type2index<3>; Eigen::IndexList<idx0, idx2, idx3> reduction_axes; #else - Eigen::array<int, 3> reduction_axes = {0, 2, 3}; + Eigen::array<Eigen::Index, 3> reduction_axes = {0, 2, 3}; #endif output->template flat<T>().device(context->eigen_device<Device>()) = output_backprop.flat<T>() @@ -263,11 +263,12 @@ class BiasGradOp : public OpKernel { .sum(reduction_axes) .template cast<T>(); // End of code by intel_tf. 
} else { - Eigen::DSizes<int, 2> two_dims(batch * height * width, channel); + Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width, + channel); #ifdef EIGEN_HAS_INDEX_LIST Eigen::IndexList<Eigen::type2index<0> > reduction_axis; #else - Eigen::array<int, 1> reduction_axis = {0}; + Eigen::array<Eigen::Index, 1> reduction_axis = {0}; #endif output->template flat<T>().device(context->eigen_device<Device>()) = output_backprop.flat<T>() diff --git a/tensorflow/core/kernels/bincount_op_gpu.cu.cc b/tensorflow/core/kernels/bincount_op_gpu.cu.cc index 6074b3e1f6..7d09e9b820 100644 --- a/tensorflow/core/kernels/bincount_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bincount_op_gpu.cu.cc @@ -17,7 +17,7 @@ limitations under the License. #define EIGEN_USE_GPU -#include "external/cub_archive/cub/device/device_histogram.cuh" +#include "third_party/cub/device/device_histogram.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD index 4910021c63..4e8bfa02fc 100644 --- a/tensorflow/core/kernels/boosted_trees/BUILD +++ b/tensorflow/core/kernels/boosted_trees/BUILD @@ -15,7 +15,9 @@ load( tf_proto_library( name = "boosted_trees_proto", - srcs = ["boosted_trees.proto"], + srcs = [ + "boosted_trees.proto", + ], cc_api_version = 2, visibility = ["//visibility:public"], ) @@ -87,9 +89,21 @@ tf_kernel_library( ) tf_kernel_library( + name = "quantile_ops", + srcs = ["quantile_ops.cc"], + deps = [ + "//tensorflow/core:boosted_trees_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/kernels/boosted_trees/quantiles:weighted_quantiles", + ], +) + +tf_kernel_library( name = "boosted_trees_ops", deps = [ ":prediction_ops", + ":quantile_ops", ":resource_ops", ":stats_ops", ":training_ops", diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto index c9664f0c1c..1ab72af059 100644 --- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto +++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto @@ -11,6 +11,7 @@ message Node { oneof node { Leaf leaf = 1; BucketizedSplit bucketized_split = 2; + CategoricalSplit categorical_split = 3; } NodeMetadata metadata = 777; } @@ -57,6 +58,18 @@ message BucketizedSplit { int32 right_id = 4; } +message CategoricalSplit { + // Categorical feature column and split describing the rule feature value == + // value. + int32 feature_id = 1; + int32 value = 2; + + // Node children indexing into a contiguous + // vector of nodes starting from the root. + int32 left_id = 3; + int32 right_id = 4; +} + // Tree describes a list of connected nodes. // Node 0 must be the root and can carry any payload including a leaf // in the case of representing the bias. diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc index b2efa06941..4ae26fb95b 100644 --- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc +++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc @@ -334,30 +334,34 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel { // Proto to store debug outputs, per example. boosted_trees::DebugOutput example_debug_info; // Initial bias prediction. E.g., prediction based off training mean. 
- example_debug_info.add_logits_path(resource->GetTreeWeight(0) * - resource->node_value(0, 0)); + float tree_logit = + resource->GetTreeWeight(0) * resource->node_value(0, 0); + example_debug_info.add_logits_path(tree_logit); int32 node_id = 0; int32 tree_id = 0; int32 feature_id; - float tree_logit; float past_trees_logit = 0; // Sum of leaf logits from prior trees. - // Populate proto. + // Go through each tree and populate proto. while (tree_id <= last_tree) { - // Feature id used to split. - feature_id = resource->feature_id(tree_id, node_id); - example_debug_info.add_feature_ids(feature_id); - // Get logit after split. - node_id = resource->next_node(tree_id, node_id, i, - batch_bucketized_features); - tree_logit = resource->GetTreeWeight(tree_id) * - resource->node_value(tree_id, node_id); - // Output logit incorporates sum of leaf logits from prior trees. - example_debug_info.add_logits_path(tree_logit + past_trees_logit); - if (resource->is_leaf(tree_id, node_id)) { - // Move onto other trees. - past_trees_logit += tree_logit; + if (resource->is_leaf(tree_id, node_id)) { // Move onto other trees. + // Accumulate tree_logits only if the leaf is non-root, but do so + // for bias tree. + if (tree_id == 0 || node_id > 0) { + past_trees_logit += tree_logit; + } ++tree_id; node_id = 0; + } else { // Add to proto. + // Feature id used to split. + feature_id = resource->feature_id(tree_id, node_id); + example_debug_info.add_feature_ids(feature_id); + // Get logit after split. + node_id = resource->next_node(tree_id, node_id, i, + batch_bucketized_features); + tree_logit = resource->GetTreeWeight(tree_id) * + resource->node_value(tree_id, node_id); + // Output logit incorporates sum of leaf logits from prior trees. + example_debug_info.add_logits_path(tree_logit + past_trees_logit); } } // Set output as serialized proto containing debug info. diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc new file mode 100644 index 0000000000..d1840941c1 --- /dev/null +++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc @@ -0,0 +1,453 @@ +// Copyright 2017 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================= +#include <algorithm> +#include <iterator> +#include <string> +#include <vector> + +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h" +#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream.h" +#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +const char* const kExampleWeightsName = "example_weights"; +const char* const kMaxElementsName = "max_elements"; +const char* const kGenerateQuantiles = "generate_quantiles"; +const char* const kNumBucketsName = "num_buckets"; +const char* const kEpsilonName = "epsilon"; +const char* const kBucketBoundariesName = "bucket_boundaries"; +const char* const kBucketsName = "buckets"; +const char* const kSummariesName = "summaries"; +const char* const kNumStreamsName = "num_streams"; +const char* const kNumFeaturesName = "num_features"; +const char* const kFloatFeaturesName = "float_values"; +const char* const kResourceHandleName = "quantile_stream_resource_handle"; + +using QuantileStreamResource = BoostedTreesQuantileStreamResource; +using QuantileStream = + boosted_trees::quantiles::WeightedQuantilesStream<float, float>; +using QuantileSummary = + boosted_trees::quantiles::WeightedQuantilesSummary<float, float>; +using QuantileSummaryEntry = + boosted_trees::quantiles::WeightedQuantilesSummary<float, + float>::SummaryEntry; + +// Generates quantiles on a finalized QuantileStream. +std::vector<float> GenerateBoundaries(const QuantileStream& stream, + const int64 num_boundaries) { + std::vector<float> boundaries = stream.GenerateBoundaries(num_boundaries); + + // Uniquify elements as we may get dupes. + auto end_it = std::unique(boundaries.begin(), boundaries.end()); + boundaries.resize(std::distance(boundaries.begin(), end_it)); + return boundaries; +} + +// Generates quantiles on a finalized QuantileStream. +std::vector<float> GenerateQuantiles(const QuantileStream& stream, + const int64 num_quantiles) { + // Do not de-dup boundaries. Exactly num_quantiles+1 boundary values + // will be returned. 
+ std::vector<float> boundaries = stream.GenerateQuantiles(num_quantiles - 1); + CHECK_EQ(boundaries.size(), num_quantiles); + return boundaries; +} + +std::vector<float> GetBuckets(const int32 feature, + const OpInputList& buckets_list) { + const auto& buckets = buckets_list[feature].flat<float>(); + std::vector<float> buckets_vector(buckets.data(), + buckets.data() + buckets.size()); + return buckets_vector; +} + +REGISTER_RESOURCE_HANDLE_KERNEL(BoostedTreesQuantileStreamResource); + +REGISTER_KERNEL_BUILDER( + Name("IsBoostedTreesQuantileStreamResourceInitialized").Device(DEVICE_CPU), + IsResourceInitialized<BoostedTreesQuantileStreamResource>); + +class BoostedTreesCreateQuantileStreamResourceOp : public OpKernel { + public: + explicit BoostedTreesCreateQuantileStreamResourceOp( + OpKernelConstruction* const context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr(kMaxElementsName, &max_elements_)); + } + + void Compute(OpKernelContext* context) override { + // Only create one, if one does not exist already. Report status for all + // other exceptions. If one already exists, it unrefs the new one. + // An epsilon value of zero could cause perfoamance issues and is therefore, + // disallowed. + const Tensor* epsilon_t; + OP_REQUIRES_OK(context, context->input(kEpsilonName, &epsilon_t)); + float epsilon = epsilon_t->scalar<float>()(); + OP_REQUIRES( + context, epsilon > 0, + errors::InvalidArgument("An epsilon value of zero is not allowed.")); + + const Tensor* num_streams_t; + OP_REQUIRES_OK(context, context->input(kNumStreamsName, &num_streams_t)); + int64 num_streams = num_streams_t->scalar<int64>()(); + + auto result = + new QuantileStreamResource(epsilon, max_elements_, num_streams); + auto status = CreateResource(context, HandleFromInput(context, 0), result); + if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) { + OP_REQUIRES(context, false, status); + } + } + + private: + // An upper bound on the number of entries that the summaries might have + // for a feature. + int64 max_elements_; +}; + +REGISTER_KERNEL_BUILDER( + Name("BoostedTreesCreateQuantileStreamResource").Device(DEVICE_CPU), + BoostedTreesCreateQuantileStreamResourceOp); + +class BoostedTreesMakeQuantileSummariesOp : public OpKernel { + public: + explicit BoostedTreesMakeQuantileSummariesOp( + OpKernelConstruction* const context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr(kNumFeaturesName, &num_features_)); + } + + void Compute(OpKernelContext* const context) override { + // Read float features list; + OpInputList float_features_list; + OP_REQUIRES_OK( + context, context->input_list(kFloatFeaturesName, &float_features_list)); + + // Parse example weights and get batch size. + const Tensor* example_weights_t; + OP_REQUIRES_OK(context, + context->input(kExampleWeightsName, &example_weights_t)); + auto example_weights = example_weights_t->flat<float>(); + const int64 batch_size = example_weights.size(); + const Tensor* epsilon_t; + OP_REQUIRES_OK(context, context->input(kEpsilonName, &epsilon_t)); + float epsilon = epsilon_t->scalar<float>()(); + + OpOutputList summaries_output_list; + OP_REQUIRES_OK( + context, context->output_list(kSummariesName, &summaries_output_list)); + + auto do_quantile_summary_gen = [&](const int64 begin, const int64 end) { + // Iterating features. 
+ for (int64 index = begin; index < end; index++) { + const auto feature_values = float_features_list[index].flat<float>(); + QuantileStream stream(epsilon, batch_size + 1); + // Run quantile summary generation. + for (int64 j = 0; j < batch_size; j++) { + stream.PushEntry(feature_values(j), example_weights(j)); + } + stream.Finalize(); + const auto summary_entry_list = stream.GetFinalSummary().GetEntryList(); + Tensor* output_t; + OP_REQUIRES_OK( + context, + summaries_output_list.allocate( + index, + TensorShape({static_cast<int64>(summary_entry_list.size()), 4}), + &output_t)); + auto output = output_t->matrix<float>(); + for (auto row = 0; row < summary_entry_list.size(); row++) { + const auto& entry = summary_entry_list[row]; + output(row, 0) = entry.value; + output(row, 1) = entry.weight; + output(row, 2) = entry.min_rank; + output(row, 3) = entry.max_rank; + } + } + }; + // TODO(tanzheny): comment on the magic number. + const int64 kCostPerUnit = 500 * batch_size; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads.num_threads, worker_threads.workers, num_features_, + kCostPerUnit, do_quantile_summary_gen); + } + + private: + int64 num_features_; +}; + +REGISTER_KERNEL_BUILDER( + Name("BoostedTreesMakeQuantileSummaries").Device(DEVICE_CPU), + BoostedTreesMakeQuantileSummariesOp); + +class BoostedTreesQuantileStreamResourceAddSummariesOp : public OpKernel { + public: + explicit BoostedTreesQuantileStreamResourceAddSummariesOp( + OpKernelConstruction* const context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + ResourceHandle handle; + OP_REQUIRES_OK(context, + HandleFromInput(context, kResourceHandleName, &handle)); + QuantileStreamResource* stream_resource; + // Create a reference to the underlying resource using the handle. + OP_REQUIRES_OK(context, LookupResource(context, handle, &stream_resource)); + // Remove the reference at the end of this scope. + mutex_lock l(*stream_resource->mutex()); + core::ScopedUnref unref_me(stream_resource); + + OpInputList summaries_list; + OP_REQUIRES_OK(context, + context->input_list(kSummariesName, &summaries_list)); + int32 num_streams = stream_resource->num_streams(); + CHECK_EQ(static_cast<int>(num_streams), summaries_list.size()); + + auto do_quantile_add_summary = [&](const int64 begin, const int64 end) { + // Iterating all features. + for (int64 feature_idx = begin; feature_idx < end; ++feature_idx) { + const Tensor& summaries = summaries_list[feature_idx]; + const auto summary_values = summaries.matrix<float>(); + const auto& tensor_shape = summaries.shape(); + const int64 entries_size = tensor_shape.dim_size(0); + CHECK_EQ(tensor_shape.dim_size(1), 4); + std::vector<QuantileSummaryEntry> summary_entries; + summary_entries.reserve(entries_size); + for (int64 i = 0; i < entries_size; i++) { + float value = summary_values(i, 0); + float weight = summary_values(i, 1); + float min_rank = summary_values(i, 2); + float max_rank = summary_values(i, 3); + QuantileSummaryEntry entry(value, weight, min_rank, max_rank); + summary_entries.push_back(entry); + } + stream_resource->stream(feature_idx)->PushSummary(summary_entries); + } + }; + + // TODO(tanzheny): comment on the magic number. 
+ const int64 kCostPerUnit = 500 * num_streams; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads.num_threads, worker_threads.workers, num_streams, + kCostPerUnit, do_quantile_add_summary); + } +}; + +REGISTER_KERNEL_BUILDER( + Name("BoostedTreesQuantileStreamResourceAddSummaries").Device(DEVICE_CPU), + BoostedTreesQuantileStreamResourceAddSummariesOp); + +class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel { + public: + explicit BoostedTreesQuantileStreamResourceFlushOp( + OpKernelConstruction* const context) + : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr(kGenerateQuantiles, &generate_quantiles_)); + } + + void Compute(OpKernelContext* context) override { + ResourceHandle handle; + OP_REQUIRES_OK(context, + HandleFromInput(context, kResourceHandleName, &handle)); + QuantileStreamResource* stream_resource; + // Create a reference to the underlying resource using the handle. + OP_REQUIRES_OK(context, LookupResource(context, handle, &stream_resource)); + // Remove the reference at the end of this scope. + mutex_lock l(*stream_resource->mutex()); + core::ScopedUnref unref_me(stream_resource); + + const Tensor* num_buckets_t; + OP_REQUIRES_OK(context, context->input(kNumBucketsName, &num_buckets_t)); + const int64 num_buckets = num_buckets_t->scalar<int64>()(); + const int64 num_streams = stream_resource->num_streams(); + + auto do_quantile_flush = [&](const int64 begin, const int64 end) { + // Iterating over all streams. + for (int64 stream_idx = begin; stream_idx < end; ++stream_idx) { + QuantileStream* stream = stream_resource->stream(stream_idx); + stream->Finalize(); + stream_resource->set_boundaries( + generate_quantiles_ ? GenerateQuantiles(*stream, num_buckets) + : GenerateBoundaries(*stream, num_buckets), + stream_idx); + } + }; + + // TODO(tanzheny): comment on the magic number. + const int64 kCostPerUnit = 500 * num_streams; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads.num_threads, worker_threads.workers, num_streams, + kCostPerUnit, do_quantile_flush); + + stream_resource->set_buckets_ready(true); + } + + private: + bool generate_quantiles_; +}; + +REGISTER_KERNEL_BUILDER( + Name("BoostedTreesQuantileStreamResourceFlush").Device(DEVICE_CPU), + BoostedTreesQuantileStreamResourceFlushOp); + +class BoostedTreesQuantileStreamResourceGetBucketBoundariesOp + : public OpKernel { + public: + explicit BoostedTreesQuantileStreamResourceGetBucketBoundariesOp( + OpKernelConstruction* const context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr(kNumFeaturesName, &num_features_)); + } + + void Compute(OpKernelContext* const context) override { + ResourceHandle handle; + OP_REQUIRES_OK(context, + HandleFromInput(context, kResourceHandleName, &handle)); + QuantileStreamResource* stream_resource; + // Create a reference to the underlying resource using the handle. + OP_REQUIRES_OK(context, LookupResource(context, handle, &stream_resource)); + // Remove the reference at the end of this scope. 
+ mutex_lock l(*stream_resource->mutex()); + core::ScopedUnref unref_me(stream_resource); + + const int64 num_streams = stream_resource->num_streams(); + CHECK_EQ(num_features_, num_streams); + OpOutputList bucket_boundaries_list; + OP_REQUIRES_OK(context, context->output_list(kBucketBoundariesName, + &bucket_boundaries_list)); + + auto do_quantile_get_buckets = [&](const int64 begin, const int64 end) { + // Iterating over all streams. + for (int64 stream_idx = begin; stream_idx < end; stream_idx++) { + const auto& boundaries = stream_resource->boundaries(stream_idx); + Tensor* bucket_boundaries_t = nullptr; + OP_REQUIRES_OK(context, + bucket_boundaries_list.allocate( + stream_idx, {static_cast<int64>(boundaries.size())}, + &bucket_boundaries_t)); + auto* quantiles_flat = bucket_boundaries_t->flat<float>().data(); + memcpy(quantiles_flat, boundaries.data(), + sizeof(float) * boundaries.size()); + } + }; + + // TODO(tanzheny): comment on the magic number. + const int64 kCostPerUnit = 500 * num_streams; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads.num_threads, worker_threads.workers, num_streams, + kCostPerUnit, do_quantile_get_buckets); + } + + private: + int64 num_features_; +}; + +REGISTER_KERNEL_BUILDER( + Name("BoostedTreesQuantileStreamResourceGetBucketBoundaries") + .Device(DEVICE_CPU), + BoostedTreesQuantileStreamResourceGetBucketBoundariesOp); + +// Given the calculated quantiles thresholds and input data, this operation +// converts the input features into the buckets (categorical values), depending +// on which quantile they fall into. +class BoostedTreesBucketizeOp : public OpKernel { + public: + explicit BoostedTreesBucketizeOp(OpKernelConstruction* const context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr(kNumFeaturesName, &num_features_)); + } + + void Compute(OpKernelContext* const context) override { + // Read float features list; + OpInputList float_features_list; + OP_REQUIRES_OK( + context, context->input_list(kFloatFeaturesName, &float_features_list)); + OpInputList bucket_boundaries_list; + OP_REQUIRES_OK(context, context->input_list(kBucketBoundariesName, + &bucket_boundaries_list)); + OP_REQUIRES(context, + tensorflow::TensorShapeUtils::IsVector( + bucket_boundaries_list[0].shape()), + errors::InvalidArgument( + strings::Printf("Buckets should be flat vectors."))); + OpOutputList buckets_list; + OP_REQUIRES_OK(context, context->output_list(kBucketsName, &buckets_list)); + + auto do_quantile_get_quantiles = [&](const int64 begin, const int64 end) { + // Iterating over all resources + for (int64 feature_idx = begin; feature_idx < end; feature_idx++) { + const Tensor& values_tensor = float_features_list[feature_idx]; + const int64 num_values = values_tensor.dim_size(0); + + Tensor* output_t = nullptr; + OP_REQUIRES_OK( + context, buckets_list.allocate( + feature_idx, TensorShape({num_values, 1}), &output_t)); + auto output = output_t->matrix<int32>(); + + const std::vector<float>& bucket_boundaries_vector = + GetBuckets(feature_idx, bucket_boundaries_list); + CHECK(!bucket_boundaries_vector.empty()) + << "Got empty buckets for feature " << feature_idx; + auto flat_values = values_tensor.flat<float>(); + for (int64 instance = 0; instance < num_values; instance++) { + const float value = flat_values(instance); + auto bucket_iter = + std::lower_bound(bucket_boundaries_vector.begin(), + bucket_boundaries_vector.end(), value); + if (bucket_iter == 
bucket_boundaries_vector.end()) { + --bucket_iter; + } + const int32 bucket = static_cast<int32>( + bucket_iter - bucket_boundaries_vector.begin()); + // Bucket id. + output(instance, 0) = bucket; + } + } + }; + + // TODO(tanzheny): comment on the magic number. + const int64 kCostPerUnit = 500 * num_features_; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads.num_threads, worker_threads.workers, num_features_, + kCostPerUnit, do_quantile_get_quantiles); + } + + private: + int64 num_features_; +}; + +REGISTER_KERNEL_BUILDER(Name("BoostedTreesBucketize").Device(DEVICE_CPU), + BoostedTreesBucketizeOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/BUILD b/tensorflow/core/kernels/boosted_trees/quantiles/BUILD index 3163c63949..12d9473776 100644 --- a/tensorflow/core/kernels/boosted_trees/quantiles/BUILD +++ b/tensorflow/core/kernels/boosted_trees/quantiles/BUILD @@ -1,5 +1,5 @@ # Description: -# This directory contains common utilities used in boosted_trees. +# This directory contains common quantile utilities used in boosted_trees. package( default_visibility = ["//tensorflow:internal"], ) @@ -16,6 +16,7 @@ cc_library( name = "weighted_quantiles", srcs = [], hdrs = [ + "quantile_stream_resource.h", "weighted_quantiles_buffer.h", "weighted_quantiles_stream.h", "weighted_quantiles_summary.h", @@ -23,6 +24,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:framework_headers_lib", + "//third_party/eigen3", ], ) diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h new file mode 100644 index 0000000000..1c31724272 --- /dev/null +++ b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h @@ -0,0 +1,96 @@ +// Copyright 2017 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= +#ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_QUANTILE_STREAM_RESOURCE_H_ +#define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_QUANTILE_STREAM_RESOURCE_H_ + +#include <vector> +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +using QuantileStream = + boosted_trees::quantiles::WeightedQuantilesStream<float, float>; + +// Quantile Stream Resource for a list of streams sharing the same number of +// quantiles, maximum elements, and epsilon. 
+class BoostedTreesQuantileStreamResource : public ResourceBase { + public: + BoostedTreesQuantileStreamResource(const float epsilon, + const int64 max_elements, + const int64 num_streams) + : are_buckets_ready_(false), + epsilon_(epsilon), + num_streams_(num_streams), + max_elements_(max_elements) { + streams_.reserve(num_streams_); + boundaries_.reserve(num_streams_); + for (int64 idx = 0; idx < num_streams; ++idx) { + streams_.push_back(QuantileStream(epsilon, max_elements)); + boundaries_.push_back(std::vector<float>()); + } + } + + string DebugString() override { return "QuantileStreamResource"; } + + tensorflow::mutex* mutex() { return &mu_; } + + QuantileStream* stream(const int64 index) { return &streams_[index]; } + + const std::vector<float>& boundaries(const int64 index) { + return boundaries_[index]; + } + + void set_boundaries(const std::vector<float>& boundaries, const int64 index) { + boundaries_[index] = boundaries; + } + + float epsilon() const { return epsilon_; } + int64 num_streams() const { return num_streams_; } + + bool are_buckets_ready() const { return are_buckets_ready_; } + void set_buckets_ready(const bool are_buckets_ready) { + are_buckets_ready_ = are_buckets_ready; + } + + private: + ~BoostedTreesQuantileStreamResource() override {} + + // Mutex for the whole resource. + tensorflow::mutex mu_; + + // Quantile streams. + std::vector<QuantileStream> streams_; + + // Stores the boundaries. Same size as streams_. + std::vector<std::vector<float>> boundaries_; + + // Whether boundaries are created. Initially boundaries are empty until + // set_boundaries are called. + bool are_buckets_ready_; + + const float epsilon_; + const int64 num_streams_; + // An upper-bound for the number of elements. + int64 max_elements_; + + TF_DISALLOW_COPY_AND_ASSIGN(BoostedTreesQuantileStreamResource); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_QUANTILE_STREAM_RESOURCE_H_ diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc index cc90bb2f45..2798722536 100644 --- a/tensorflow/core/kernels/boosted_trees/resources.cc +++ b/tensorflow/core/kernels/boosted_trees/resources.cc @@ -60,14 +60,26 @@ int32 BoostedTreesEnsembleResource::next_node( DCHECK_LT(tree_id, tree_ensemble_->trees_size()); DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size()); const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id); - DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit); - const auto& split = node.bucketized_split(); - if (bucketized_features[split.feature_id()](index_in_batch) <= - split.threshold()) { - return split.left_id(); - } else { - return split.right_id(); + + switch (node.node_case()) { + case boosted_trees::Node::kBucketizedSplit: { + const auto& split = node.bucketized_split(); + return (bucketized_features[split.feature_id()](index_in_batch) <= + split.threshold()) + ? split.left_id() + : split.right_id(); + } + case boosted_trees::Node::kCategoricalSplit: { + const auto& split = node.categorical_split(); + return (bucketized_features[split.feature_id()](index_in_batch) == + split.value()) + ? 
split.left_id() + : split.right_id(); + } + default: + DCHECK(false) << "Node type " << node.node_case() << " not supported."; } + return -1; } float BoostedTreesEnsembleResource::node_value(const int32 tree_id, diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h index de9b69828e..639c3062cc 100644 --- a/tensorflow/core/kernels/conv_2d.h +++ b/tensorflow/core/kernels/conv_2d.h @@ -137,17 +137,16 @@ struct MatMulConvFunctor { } }; -// Shuffles a filter tensor from: -// [<spatial_dims>, in, out] -// to: -// [out, in, <spatial_dims>] +// Shuffles a filter tensor from TensorFlow format HWIO to dst_filter_format. +// +// Note: Currently OIHW is the only supported destination format. Support for +// OHWI format will be added in a follow-up change. template <typename Device, typename T, typename IndexType, int NDIMS> struct TransformFilter { - void operator()(const Device& d, + void operator()(const Device& d, FilterTensorFormat dst_filter_format, typename TTypes<T, NDIMS, IndexType>::ConstTensor in, typename TTypes<T, NDIMS, IndexType>::Tensor out) { - // We want a 3, 2, 0, 1 shuffle. Merge the spatial dimensions together - // to speed up the shuffle operation. + // Merge the spatial dimensions together to speed up the shuffle operation. Eigen::DSizes<IndexType, 3> merged_dims; merged_dims[0] = in.dimension(0); // spatial dimensions for (int i = 1; i < NDIMS - 2; ++i) { @@ -156,16 +155,30 @@ struct TransformFilter { merged_dims[1] = in.dimension(NDIMS - 2); // input filters merged_dims[2] = in.dimension(NDIMS - 1); // output filters + CHECK(dst_filter_format == FORMAT_OIHW) + << "Unsupported destination filter format: " + << ToString(dst_filter_format); + // Source filter format is FORMAT_HWIO and spatial dimensions HW are merged + // in the beginning. + Eigen::DSizes<IndexType, 3> shuffling_perm = + Eigen::DSizes<IndexType, 3>(2, 1, 0); + Eigen::DSizes<IndexType, NDIMS> expanded_dims; - expanded_dims[0] = in.dimension(NDIMS - 1); // output filters - expanded_dims[1] = in.dimension(NDIMS - 2); // input filters - for (int i = 0; i < NDIMS - 2; ++i) { // spatial dimensions - expanded_dims[i + 2] = in.dimension(i); + int out_index = 0; + for (int merged_dim = 0; merged_dim < merged_dims.rank(); ++merged_dim) { + if (shuffling_perm[merged_dim] == 0) { + for (int spatial_dim = 0; spatial_dim < NDIMS - 2; ++spatial_dim) { + expanded_dims[out_index++] = in.dimension(spatial_dim); + } + } else { + constexpr int kLastSpatialDim = NDIMS - 3; + expanded_dims[out_index++] = + in.dimension(kLastSpatialDim + shuffling_perm[merged_dim]); + } } - out.device(d) = in.reshape(merged_dims) - .shuffle(Eigen::DSizes<IndexType, 3>(2, 1, 0)) - .reshape(expanded_dims); + out.device(d) = + in.reshape(merged_dims).shuffle(shuffling_perm).reshape(expanded_dims); } }; @@ -282,7 +295,9 @@ struct SwapDimension0And2InTensor3 { const gtl::ArraySlice<int64>& input_dims, T* out); }; -// Reverses the effect of TransformFilter above. +// Transforms back filter from OIHW to HWOI format to reverse effect of +// TransformFilter above. +// TODO(hinsu): Support reverse transformation from filter format OHWI as well. 
template <typename Device, typename T, int NDIMS> struct ReverseTransformFilter { void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in, diff --git a/tensorflow/core/kernels/conv_3d.h b/tensorflow/core/kernels/conv_3d.h index 02e3655ad1..b819c6f910 100644 --- a/tensorflow/core/kernels/conv_3d.h +++ b/tensorflow/core/kernels/conv_3d.h @@ -19,6 +19,7 @@ limitations under the License. #define TENSORFLOW_CORE_KERNELS_CONV_3D_H_ #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h" #include "tensorflow/core/kernels/eigen_cuboid_convolution.h" namespace tensorflow { @@ -28,6 +29,14 @@ namespace functor { template <typename Device, typename T> struct CuboidConvolution; +// Backward input pass for the cuboid convolution. +template <typename Device, typename T> +struct CuboidConvolutionBackwardInput; + +// Backward filter pass for the cuboid convolution. +template <typename Device, typename T> +struct CuboidConvolutionBackwardFilter; + typedef Eigen::ThreadPoolDevice CPUDevice; template <typename T> @@ -42,6 +51,40 @@ struct CuboidConvolution<CPUDevice, T> { } }; +template <typename T> +struct CuboidConvolutionBackwardInput<CPUDevice, T> { + void operator()(const CPUDevice& d, + typename TTypes<T, 5>::Tensor input_backward, + typename TTypes<T, 5>::ConstTensor filter, + typename TTypes<T, 5>::ConstTensor output_backward, + int stride_planes, int stride_rows, int stride_cols) { + // Need to swap the order of plane/row/col strides when calling Eigen. + input_backward.device(d) = Eigen::CuboidConvolutionBackwardInput( + filter, output_backward, + input_backward.dimension(3), // input_planes + input_backward.dimension(2), // input_rows + input_backward.dimension(1), // input_cols + stride_cols, stride_rows, stride_planes); + } +}; + +template <typename T> +struct CuboidConvolutionBackwardFilter<CPUDevice, T> { + void operator()(const CPUDevice& d, + typename TTypes<T, 5>::Tensor filter_backward, + typename TTypes<T, 5>::ConstTensor input, + typename TTypes<T, 5>::ConstTensor output_backward, + int stride_planes, int stride_rows, int stride_cols) { + // Need to swap the order of plane/row/col strides when calling Eigen. 
+ filter_backward.device(d) = Eigen::CuboidConvolutionBackwardKernel( + input, output_backward, + filter_backward.dimension(2), // kernel_planes + filter_backward.dimension(1), // kernel_rows + filter_backward.dimension(0), // kernel_cols + stride_cols, stride_rows, stride_planes); + } +}; + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc index 63b1bcda43..9e86a16b66 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc @@ -1018,7 +1018,8 @@ namespace functor { extern template struct InflatePadAndShuffle<GPUDevice, T, 4, int>; \ template <> \ void TransformFilter<GPUDevice, T, int, 4>::operator()( \ - const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \ + const GPUDevice& d, FilterTensorFormat dst_filter_format, \ + typename TTypes<T, 4, int>::ConstTensor in, \ typename TTypes<T, 4, int>::Tensor out); \ extern template struct TransformFilter<GPUDevice, T, int, 4>; \ template <> \ diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index d664a11e73..43bb5ea56c 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -901,7 +901,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()( &transformed_filter)); functor::TransformFilter<GPUDevice, T, int, 4>()( - ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()), + ctx->eigen_device<GPUDevice>(), FORMAT_OIHW, + To32Bit(filter.tensor<T, 4>()), To32Bit(transformed_filter.tensor<T, 4>())); Tensor transformed_out_backprop; @@ -1090,7 +1091,8 @@ namespace functor { extern template struct InflatePadAndShuffle<GPUDevice, T, 4, int>; \ template <> \ void TransformFilter<GPUDevice, T, int, 4>::operator()( \ - const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \ + const GPUDevice& d, FilterTensorFormat dst_filter_format, \ + typename TTypes<T, 4, int>::ConstTensor in, \ typename TTypes<T, 4, int>::Tensor out); \ extern template struct TransformFilter<GPUDevice, T, int, 4>; \ template <> \ diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc index fc0a2f123f..507720c998 100644 --- a/tensorflow/core/kernels/conv_grad_ops.cc +++ b/tensorflow/core/kernels/conv_grad_ops.cc @@ -41,6 +41,17 @@ limitations under the License. namespace tensorflow { +// Compute padding for the given spatial dimension. +int ConvBackpropDimensions::SpatialPadding(const Padding& padding, + int dim) const { + return (padding == VALID) + ? 0 + : std::max<int>( + 0, static_cast<int>((output_size(dim) - 1) * stride(dim) + + (filter_size(dim) - 1) * dilation(dim) + + 1 - input_size(dim))); +} + // The V2 version computes windowed output size with arbitrary dilation_rate, // while the original version only handles the cases where dilation_rates equal // to 1. diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h index 535586d53a..9551959463 100644 --- a/tensorflow/core/kernels/conv_grad_ops.h +++ b/tensorflow/core/kernels/conv_grad_ops.h @@ -234,6 +234,16 @@ struct ConvBackpropDimensions { // Input and output feature depth. int64 in_depth, out_depth; + + // Convenience access methods for spatial dimensions properties. 
+ int64 input_size(int dim) const { return spatial_dims[dim].input_size; } + int64 filter_size(int dim) const { return spatial_dims[dim].filter_size; } + int64 output_size(int dim) const { return spatial_dims[dim].output_size; } + int64 stride(int dim) const { return spatial_dims[dim].stride; } + int64 dilation(int dim) const { return spatial_dims[dim].dilation; } + + // Compute padding for the given spatial dimension. + int SpatialPadding(const Padding& padding, int dim) const; }; // Common code between implementations of Conv?DBackpropInput and diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index 15f1bf9aba..bab91f5e86 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/conv_grad_ops.h" #include "tensorflow/core/kernels/conv_ops_gpu.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" @@ -32,111 +33,130 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" +#include "tensorflow/core/util/work_sharder.h" #if GOOGLE_CUDA #include "tensorflow/core/platform/stream_executor.h" using stream_executor::dnn::DimIndex; #endif +namespace { + +// TODO(ezhulenev): Split this file into conv_grad_filter_ops_3d.cc and +// conv_grad_input_ops_3d.cc. + +// TODO(ezhulenev): Generalize Col2im and Im2col for 2-d and 3-d kernels. + +// "Depth" is already used for the channel dimension, so for the third spatial +// dimension in this file we use "plane", although in NDHWC layout it's +// indicated with a "D". + +// Returns in 'im_data' (assumed to be zero-initialized) image patch in storage +// order (planes, height, width, depth), constructed from patches in 'col_data', +// which is required to be in storage order (out_planes * out_height * +// out_width, filter_planes, filter_height, filter_width, in_depth). +// +// Based on 2-dimensional implementation written by Yangqing Jia (jiayq). 
+template <typename T> +void Col2im(const T* col_data, const int depth, const int planes, + const int height, const int width, const int filter_p, + const int filter_h, const int filter_w, const int pad_pt, + const int pad_t, const int pad_l, const int pad_pb, const int pad_b, + const int pad_r, const int stride_p, const int stride_h, + const int stride_w, T* im_data) { + const int planes_col = (planes + pad_pt + pad_pb - filter_p) / stride_p + 1; + const int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; + const int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; + int p_pad = -pad_pt; + for (int p = 0; p < planes_col; ++p) { + int h_pad = -pad_t; + for (int h = 0; h < height_col; ++h) { + int w_pad = -pad_l; + for (int w = 0; w < width_col; ++w) { + T* im_patch_data = + im_data + (p_pad * height * width + h_pad * width + w_pad) * depth; + for (int ip = p_pad; ip < p_pad + filter_p; ++ip) { + for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { + for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { + if (ip >= 0 && ip < planes && ih >= 0 && ih < height && iw >= 0 && + iw < width) { + for (int i = 0; i < depth; ++i) { + im_patch_data[i] += col_data[i]; + } + } + im_patch_data += depth; + col_data += depth; + } + // Jump over remaining number of depth. + im_patch_data += depth * (width - filter_w); + } + // Jump over remaining number of (depth * width). + im_patch_data += (depth * width) * (height - filter_h); + } + w_pad += stride_w; + } + h_pad += stride_h; + } + p_pad += stride_p; + } +} + +// Returns in 'col_data', image patches in storage order (planes, height, width, +// depth) extracted from image at 'input_data', which is required to be in +// storage order (batch, planes, height, width, depth). +// +// Based on 2-dimensional implementation written by Yangqing Jia (jiayq). +template <typename T> +void Im2col(const T* input_data, const int depth, const int planes, + const int height, const int width, const int filter_p, + const int filter_h, const int filter_w, const int pad_pt, + const int pad_t, const int pad_l, const int pad_pb, const int pad_b, + const int pad_r, const int stride_p, const int stride_h, + const int stride_w, T* col_data) { + const int planes_col = (planes + pad_pt + pad_pb - filter_p) / stride_p + 1; + const int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; + const int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; + + int p_pad = -pad_pt; + for (int p = 0; p < planes_col; ++p) { + int h_pad = -pad_t; + for (int h = 0; h < height_col; ++h) { + int w_pad = -pad_l; + for (int w = 0; w < width_col; ++w) { + for (int ip = p_pad; ip < p_pad + filter_p; ++ip) { + for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { + for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { + if (ip >= 0 && ip < planes && ih >= 0 && ih < height && iw >= 0 && + iw < width) { + memcpy(col_data, + input_data + + (ip * height * width + ih * width + iw) * depth, + sizeof(T) * depth); + } else { + // This should be simply padded with zero. + memset(col_data, 0, sizeof(T) * depth); + } + col_data += depth; + } + } + } + w_pad += stride_w; + } + h_pad += stride_h; + } + p_pad += stride_p; + } +} + +} // namespace + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -// TODO(mjanusz): Get rid of the macro and return shapes directly. 
-#define EXTRACT_AND_VERIFY_DIMENSIONS(label) \ - const Tensor& out_backprop = context->input(2); \ - OP_REQUIRES( \ - context, input_shape.dims() == 5, \ - errors::InvalidArgument(label, ": input must be 5-dimensional")); \ - OP_REQUIRES( \ - context, filter_shape.dims() == 5, \ - errors::InvalidArgument(label, ": filter must be 5-dimensional")); \ - OP_REQUIRES( \ - context, out_backprop.dims() == 5, \ - errors::InvalidArgument(label, ": out_backprop must be 5-dimensional")); \ - const int64 batch = input_shape.dim_size(0); \ - OP_REQUIRES( \ - context, batch == out_backprop.dim_size(0), \ - errors::InvalidArgument( \ - label, ": input and out_backprop must have the same batch size")); \ - const std::array<int64, 3> input_size = { \ - {GetTensorDim(input_shape, data_format_, '0'), \ - GetTensorDim(input_shape, data_format_, '1'), \ - GetTensorDim(input_shape, data_format_, '2')}}; \ - const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C'); \ - const std::array<int64, 3> filter_size = {{filter_shape.dim_size(0), \ - filter_shape.dim_size(1), \ - filter_shape.dim_size(2)}}; \ - const int64 output_cols = GetTensorDim(out_backprop, data_format_, '2'); \ - const int64 output_rows = GetTensorDim(out_backprop, data_format_, '1'); \ - const int64 output_planes = GetTensorDim(out_backprop, data_format_, '0'); \ - OP_REQUIRES(context, in_depth == filter_shape.dim_size(3), \ - errors::InvalidArgument( \ - label, ": input and filter must have the same depth")); \ - const int64 out_depth = filter_shape.dim_size(4); \ - OP_REQUIRES( \ - context, out_depth == GetTensorDim(out_backprop, data_format_, 'C'), \ - errors::InvalidArgument( \ - label, ": filter and out_backprop must have the same out_depth")); \ - const std::array<int64, 3> dilations = { \ - {GetTensorDim(dilation_, data_format_, '0'), \ - GetTensorDim(dilation_, data_format_, '1'), \ - GetTensorDim(dilation_, data_format_, '2')}}; \ - const std::array<int64, 3> strides = { \ - {GetTensorDim(stride_, data_format_, '0'), \ - GetTensorDim(stride_, data_format_, '1'), \ - GetTensorDim(stride_, data_format_, '2')}}; \ - std::array<int64, 3> out, padding; \ - OP_REQUIRES_OK( \ - context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides, \ - padding_, &out, &padding)); \ - OP_REQUIRES(context, output_planes == out[0], \ - errors::InvalidArgument( \ - label, \ - ": Number of planes of out_backprop doesn't match " \ - "computed: actual = ", \ - output_planes, ", computed = ", out[0])); \ - OP_REQUIRES( \ - context, output_rows == out[1], \ - errors::InvalidArgument( \ - label, ": Number of rows of out_backprop doesn't match computed: ", \ - "actual = ", output_rows, ", computed = ", out[1])); \ - OP_REQUIRES( \ - context, output_cols == out[2], \ - errors::InvalidArgument( \ - label, ": Number of cols of out_backprop doesn't match computed: ", \ - "actual = ", output_cols, ", computed = ", out[2])); \ - const auto expanded_out_planes = (output_planes - 1) * strides[0] + 1; \ - const auto expanded_out_rows = (output_rows - 1) * strides[1] + 1; \ - const auto expanded_out_cols = (output_cols - 1) * strides[2] + 1; \ - const auto padded_out_planes = input_size[0] + filter_size[0] - 1; \ - const auto padded_out_rows = input_size[1] + filter_size[1] - 1; \ - const auto padded_out_cols = input_size[2] + filter_size[2] - 1; \ - const auto top_pad_planes = filter_size[0] - 1 - padding[0]; \ - const auto top_pad_rows = filter_size[1] - 1 - padding[1]; \ - const auto left_pad_cols = filter_size[2] - 1 - padding[2]; \ - const auto 
bottom_pad_planes = \ - padded_out_planes - expanded_out_planes - top_pad_planes; \ - const auto bottom_pad_rows = \ - padded_out_rows - expanded_out_rows - top_pad_rows; \ - const auto right_pad_cols = \ - padded_out_cols - expanded_out_cols - left_pad_cols; \ - VLOG(2) << "Conv3d: " << label \ - << ": expanded_out_planes = " << expanded_out_planes \ - << ": expanded_out_rows = " << expanded_out_rows \ - << ", expanded_out_cols = " << expanded_out_cols \ - << ", padded_out_planes = " << padded_out_planes \ - << ", padded_out_rows = " << padded_out_rows \ - << ", padded_out_cols = " << padded_out_cols \ - << ", top_pad_planes = " << top_pad_planes \ - << ", top_pad_rows = " << top_pad_rows \ - << ", left_pad_cols = " << left_pad_cols \ - << ", bottom_pad_planes = " << bottom_pad_planes \ - << ", bottom_pad_rows = " << bottom_pad_rows \ - << ", right_pad_cols = " << right_pad_cols - -// Backprop for input. +// Backprop for input that offloads computation to +// Eigen::CuboidConvolutionBackwardInput. template <typename Device, class T> class Conv3DBackpropInputOp : public OpKernel { public: @@ -192,6 +212,116 @@ class Conv3DBackpropInputOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& filter = context->input(1); const TensorShape& filter_shape = filter.shape(); + + const Tensor& out_backprop = context->input(2); + const TensorShape& out_backprop_shape = out_backprop.shape(); + + TensorShape input_shape; + if (takes_shape_) { + const Tensor& input_sizes = context->input(0); + // MakeShape is able to handle both DT_INT32 and DT_INT64 for input_sizes. + OP_REQUIRES_OK(context, MakeShape(input_sizes, &input_shape)); + } else { + input_shape = context->input(0).shape(); + } + + ConvBackpropDimensions dims; + OP_REQUIRES_OK(context, ConvBackpropComputeDimensions( + "Conv3DBackpropInputOp", /*num_spatial_dims=*/3, + input_shape, filter_shape, out_backprop_shape, + stride_, padding_, data_format_, &dims)); + + Tensor* in_backprop; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + + functor::CuboidConvolutionBackwardInput<Device, T>()( + context->eigen_device<Device>(), + in_backprop->tensor<T, 5>(), // input_backward + filter.tensor<T, 5>(), // filter + out_backprop.tensor<T, 5>(), // output_backward + static_cast<int>(dims.spatial_dims[0].stride), // stride_planes + static_cast<int>(dims.spatial_dims[1].stride), // stride_rows + static_cast<int>(dims.spatial_dims[2].stride)); // stride_cols + } + + private: + std::vector<int32> dilation_; + std::vector<int32> stride_; + Padding padding_; + TensorFormat data_format_; + bool takes_shape_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv3DBackpropInputOp); +}; + +// Custom backprop for input that explicitly does the work sharding and calls +// Eigen only to multiply matrices. +template <typename Device, class T> +class Conv3DCustomBackpropInputOp : public OpKernel { + // Limit the maximum size of allocated temporary buffer to + // kMaxTempAllocationOverhead times the size of the input tensors (input, + // filter, out_backprop). If the size of the temporary buffer exceeds this + // limit, fallback on Eigen implementation. + static constexpr int kMaxTempAllocationOverhead = 25; + + public: + explicit Conv3DCustomBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context), + data_format_(FORMAT_NHWC), + takes_shape_(type_string().find("V2") != std::string::npos) { + // data_format is only available in V2. 
+ if (takes_shape_) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES( + context, data_format_ == FORMAT_NHWC, + errors::InvalidArgument( + "Conv3DBackpropInputOpV2 only supports NDHWC on the CPU.")); + } + + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, 'C') == 1 && + GetTensorDim(dilation_, data_format_, 'N') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth dimensions.")); + + // TODO(yangzihao): Add CPU version of dilated conv 3D. + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, '0') == 1 && + GetTensorDim(dilation_, data_format_, '1') == 1 && + GetTensorDim(dilation_, data_format_, '2') == 1), + errors::InvalidArgument( + "Current CPU implementation does not yet support " + "dilation rates larger than 1.")); + + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 5, + errors::InvalidArgument("Sliding window strides field must " + "specify 5 dimensions")); + OP_REQUIRES( + context, + (GetTensorDim(stride_, data_format_, 'C') == 1 && + GetTensorDim(stride_, data_format_, 'N') == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& filter = context->input(1); + const TensorShape& filter_shape = filter.shape(); + + const Tensor& out_backprop = context->input(2); + const TensorShape& out_backprop_shape = out_backprop.shape(); + TensorShape input_shape; if (takes_shape_) { const Tensor& input_sizes = context->input(0); @@ -200,51 +330,239 @@ class Conv3DBackpropInputOp : public OpKernel { } else { input_shape = context->input(0).shape(); } - EXTRACT_AND_VERIFY_DIMENSIONS("Conv3DBackpropInput"); - Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 5> pad_dims{ - {0, 0}, - {top_pad_planes, bottom_pad_planes}, - {top_pad_rows, bottom_pad_rows}, - {left_pad_cols, right_pad_cols}, - {0, 0}}; + + ConvBackpropDimensions dims; + OP_REQUIRES_OK(context, ConvBackpropComputeDimensions( + "Conv3DBackpropInputOp", /*num_spatial_dims=*/3, + input_shape, filter_shape, out_backprop_shape, + stride_, padding_, data_format_, &dims)); + Tensor* in_backprop; OP_REQUIRES_OK(context, context->allocate_output(0, input_shape, &in_backprop)); - // Fill out a padded out_backprop. 
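The three `GetWindowedOutputSizeVerbose` calls that follow compute, for each spatial dimension, the output size plus explicit before/after padding. A minimal sketch of that arithmetic (an illustrative helper, not TF's function; VALID uses no padding, SAME splits the needed total with the smaller half in front):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

struct WindowedOutput {
  int64_t output_size;
  int64_t pad_before;
  int64_t pad_after;
};

// Illustrative sketch of the per-dimension padding arithmetic.
WindowedOutput ComputeWindowedOutput(int64_t input, int64_t filter,
                                     int64_t stride, bool same_padding) {
  WindowedOutput out{};
  if (same_padding) {
    out.output_size = (input + stride - 1) / stride;  // ceil(input / stride)
    const int64_t pad_total =
        std::max<int64_t>(0, (out.output_size - 1) * stride + filter - input);
    out.pad_before = pad_total / 2;  // smaller half goes in front
    out.pad_after = pad_total - out.pad_before;
  } else {  // VALID: no padding
    out.output_size = (input - filter + stride) / stride;
    out.pad_before = 0;
    out.pad_after = 0;
  }
  return out;
}

int main() {
  // Example: a spatial extent of 4, filter 3, stride 2, SAME padding.
  const WindowedOutput dim = ComputeWindowedOutput(4, 3, 2, /*same=*/true);
  std::cout << "output=" << dim.output_size       // 2
            << " pad_before=" << dim.pad_before   // 0
            << " pad_after=" << dim.pad_after     // 1 (asymmetric total of 1)
            << "\n";
}
```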
- TensorShape padded_out_shape({batch, padded_out_planes, padded_out_rows, - padded_out_cols, out_depth}); - Tensor padded_output; + int64 top_pad_planes, bottom_pad_planes; + int64 top_pad_rows, bottom_pad_rows; + int64 left_pad_cols, right_pad_cols; + + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + dims.spatial_dims[0].input_size, + dims.spatial_dims[0].filter_size, + dims.spatial_dims[0].stride, padding_, + &dims.spatial_dims[0].output_size, + &top_pad_planes, &bottom_pad_planes)); + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + dims.spatial_dims[1].input_size, + dims.spatial_dims[1].filter_size, + dims.spatial_dims[1].stride, padding_, + &dims.spatial_dims[1].output_size, + &top_pad_rows, &bottom_pad_rows)); + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + dims.spatial_dims[2].input_size, + dims.spatial_dims[2].filter_size, + dims.spatial_dims[2].stride, padding_, + &dims.spatial_dims[2].output_size, + &left_pad_cols, &right_pad_cols)); + + // TODO(ezhulenev): Extract work size and shard estimation to shared + // functions in conv_grad_ops, and update 2d convolution backprop. + + // The total dimension size of each kernel. + const int64 filter_total_size = + dims.spatial_dims[0].filter_size * dims.spatial_dims[1].filter_size * + dims.spatial_dims[2].filter_size * dims.in_depth; + + // The output image size is the spatial size of the output. + const int64 output_image_size = dims.spatial_dims[0].output_size * + dims.spatial_dims[1].output_size * + dims.spatial_dims[2].output_size; + + const auto cache_sizes = Eigen::internal::CacheSizes(); + const ptrdiff_t l3_cache_size = cache_sizes.m_l3; + + // Use L3 cache size as target working set size. + const size_t target_working_set_size = l3_cache_size / sizeof(T); + + // Calculate size of matrices involved in MatMul: C = A x B. + const int64 size_A = output_image_size * dims.out_depth; + + const int64 size_B = filter_total_size * dims.out_depth; + + const int64 size_C = output_image_size * filter_total_size; + + const int64 work_unit_size = size_A + size_B + size_C; + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + + // Use parallel tensor contractions if there is no batching. + // + // Compared to Conv2D code, this version is missing work size estimation. In + // benchmarks I didn't find a case when it's beneficial to run parallel + // contraction compared to sharding and matmuls. + const bool use_parallel_contraction = dims.batch_size == 1; + + const size_t shard_size = + use_parallel_contraction + ? 1 + : (target_working_set_size + work_unit_size - 1) / work_unit_size; + + // Total number of elements in all the tensors used by this kernel. + int64 total_tensor_elements = input_shape.num_elements() + + filter_shape.num_elements() + + out_backprop_shape.num_elements(); + + // Shape of the temporary workspace buffer. + TensorShape col_buffer_shape = {static_cast<int64>(shard_size), + static_cast<int64>(output_image_size), + static_cast<int64>(filter_total_size)}; + int64 col_buffer_elements = col_buffer_shape.num_elements(); + + // If the temporary allocation overhead is too large, fallback on Eigen + // implementation which requires much less memory. 
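The sizing logic above aims each shard of images at the L3 cache: `work_unit_size` is the number of elements touched by one image's matmul, `shard_size` is how many images fit in the target working set, and the check immediately below abandons the custom path when the scratch `col_buffer` would exceed `kMaxTempAllocationOverhead` (25) times the op's own tensors. A back-of-the-envelope sketch with made-up sizes (the real kernel reads the cache size from `Eigen::internal::CacheSizes()`):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  const int64_t l3_cache_bytes = 8 << 20;                 // assume an 8 MiB L3
  const int64_t target_working_set = l3_cache_bytes / 4;  // elements of float

  // Per-image matmul C = A * B^T with A: [output_image_size x out_depth],
  // B: [filter_total_size x out_depth], C: [output_image_size x filter_total_size].
  const int64_t output_image_size = 16 * 16 * 16;
  const int64_t out_depth = 64;
  const int64_t filter_total_size = 3 * 3 * 3 * 32;  // filter volume * in_depth

  const int64_t size_A = output_image_size * out_depth;
  const int64_t size_B = filter_total_size * out_depth;
  const int64_t size_C = output_image_size * filter_total_size;
  const int64_t work_unit_size = size_A + size_B + size_C;

  // How many images per shard so one shard's working set roughly fits in L3.
  const int64_t shard_size =
      (target_working_set + work_unit_size - 1) / work_unit_size;

  // The col_buffer holds shard_size * size_C elements; if that is more than
  // 25x the combined size of input, filter and out_backprop, the kernel
  // falls back to the Eigen implementation instead of allocating it.
  const int64_t col_buffer_elements = shard_size * size_C;
  std::cout << "work_unit=" << work_unit_size << " shard_size=" << shard_size
            << " col_buffer_elements=" << col_buffer_elements << "\n";
}
```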
+ int64 col_buffer_overhead = col_buffer_elements / total_tensor_elements; + if (col_buffer_overhead > kMaxTempAllocationOverhead) { + VLOG(2) << "Fallback on Eigen implementation of Conv3DBackpropInputOp: " + "col_buffer_overhead=" + << col_buffer_overhead; + + functor::CuboidConvolutionBackwardInput<Device, T>()( + context->eigen_device<Device>(), + in_backprop->tensor<T, 5>(), // input_backward + filter.tensor<T, 5>(), // filter + out_backprop.tensor<T, 5>(), // output_backward + static_cast<int>(dims.spatial_dims[0].stride), // stride_planes + static_cast<int>(dims.spatial_dims[1].stride), // stride_rows + static_cast<int>(dims.spatial_dims[2].stride)); // stride_cols + + return; + } + + Tensor col_buffer; OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum<T>::v(), - padded_out_shape, &padded_output)); - Eigen::DSizes<Eigen::DenseIndex, 5> no_op_shuffle{0, 1, 2, 3, 4}; - Eigen::DSizes<Eigen::DenseIndex, 5> eigen_strides{1, strides[0], strides[1], - strides[2], 1}; - functor::InflatePadAndShuffle<Device, T, 5, Eigen::DenseIndex>()( - context->eigen_device<Device>(), out_backprop.tensor<T, 5>(), - eigen_strides, pad_dims, no_op_shuffle, padded_output.tensor<T, 5>()); - const Tensor& padded_output_cref = padded_output; - - // Fill a new "reverted" filter. We need to transpose the in_depth and - // out_depth for the filter and reverse the planes, rows and cols. - TensorShape r_filter_shape( - {filter_size[0], filter_size[1], filter_size[2], out_depth, in_depth}); - Tensor r_filter; - OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(), - r_filter_shape, &r_filter)); - Eigen::DSizes<Eigen::DenseIndex, 5> filter_order{0, 1, 2, 4, 3}; - Eigen::array<bool, 5> filter_rev_dims{true, true, true, false, false}; - functor::ShuffleAndReverse<Device, T, 5, Eigen::DenseIndex>()( - context->eigen_device<Device>(), filter.tensor<T, 5>(), filter_order, - filter_rev_dims, r_filter.tensor<T, 5>()); - const Tensor& r_filter_cref = r_filter; - - // Now we can call conv_3d directly. - functor::CuboidConvolution<Device, T>()( - context->eigen_device<Device>(), in_backprop->tensor<T, 5>(), - padded_output_cref.tensor<T, 5>(), r_filter_cref.tensor<T, 5>(), 1, 1, - 1, BrainPadding2EigenPadding(VALID)); + context->allocate_temp(DataTypeToEnum<T>::value, + col_buffer_shape, &col_buffer)); + + // The input offset corresponding to a single input image. + const int64 input_offset = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * + dims.spatial_dims[2].input_size * dims.in_depth; + + // The output offset corresponding to a single output image. + const int64 output_offset = + dims.spatial_dims[0].output_size * dims.spatial_dims[1].output_size * + dims.spatial_dims[2].output_size * dims.out_depth; + + const T* filter_data = filter.template flat<T>().data(); + T* col_buffer_data = col_buffer.template flat<T>().data(); + const T* out_backprop_data = out_backprop.template flat<T>().data(); + + auto in_backprop_flat = in_backprop->template flat<T>(); + T* input_backprop_data = in_backprop_flat.data(); + in_backprop_flat.device(context->eigen_device<Device>()) = + in_backprop_flat.constant(T(0)); + + if (use_parallel_contraction) { + typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, + Eigen::Unaligned> + TensorMap; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>, + Eigen::Unaligned> + ConstTensorMap; + + // Initialize contraction dims (we need to transpose 'B' below). 
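Both branches below do the same per-image work: form the patch-gradient matrix C = out_backprop · filterᵀ (the parallel branch writes this as a tensor contraction pairing dimension 1 of both operands, the sharded branch as `A * B.transpose()`), then scatter C back into the input gradient with `Col2im`. A toy 1D version of that two-step pattern with hypothetical shapes (TF's `Col2im` works on volumes, carries `in_depth` channels per position and applies the explicit padding computed above):

```cpp
#include <Eigen/Dense>
#include <iostream>
#include <vector>

int main() {
  // Toy 1D problem: input width 5, filter width 3, stride 2, in_depth = 1,
  // out_depth = 1, so filter_total_size = 3 and output_image_size = 2.
  const int W = 5, K = 3, stride = 2;
  const int out_w = (W - K) / stride + 1;  // 2

  Eigen::MatrixXf A(out_w, 1);  // out_backprop: [output_image_size x out_depth]
  A << 0.1f, 0.2f;
  Eigen::MatrixXf B(K, 1);      // filter:       [filter_total_size x out_depth]
  B << 1.f, 2.f, 3.f;

  // Step 1: per-patch gradient, C = A * B^T. The {1, 1} contraction pair in
  // the parallel branch expresses exactly this product.
  Eigen::MatrixXf C(out_w, K);
  C.noalias() = A * B.transpose();

  // Step 2: Col2im scatter. Input positions covered by several patches
  // accumulate one contribution per patch.
  std::vector<float> input_grad(W, 0.f);
  for (int i = 0; i < out_w; ++i)
    for (int k = 0; k < K; ++k) input_grad[i * stride + k] += C(i, k);

  for (float g : input_grad) std::cout << g << " ";  // 0.1 0.2 0.5 0.4 0.6
  std::cout << "\n";
}
```

Because overlapping patches accumulate into the same input positions, the real kernel zeroes `in_backprop` before the loop, as seen just above.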
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_dims; + contract_dims[0].first = 1; + contract_dims[0].second = 1; + + for (int image_id = 0; image_id < dims.batch_size; ++image_id) { + // Compute gradient into col_buffer. + TensorMap C(col_buffer_data, output_image_size, filter_total_size); + + ConstTensorMap A(out_backprop_data + output_offset * image_id, + output_image_size, dims.out_depth); + ConstTensorMap B(filter_data, filter_total_size, dims.out_depth); + + C.device(context->eigen_cpu_device()) = A.contract(B, contract_dims); + + Col2im<T>(col_buffer_data, dims.in_depth, + // Input spatial dimensions. + dims.spatial_dims[0].input_size, // input planes + dims.spatial_dims[1].input_size, // input rows + dims.spatial_dims[2].input_size, // input cols + // Filter spatial dimensions. + dims.spatial_dims[0].filter_size, // filter planes + dims.spatial_dims[1].filter_size, // filter rows + dims.spatial_dims[2].filter_size, // filter cols + // Spatial padding. + top_pad_planes, top_pad_rows, left_pad_cols, + bottom_pad_planes, bottom_pad_rows, right_pad_cols, + // Spatial striding. + dims.spatial_dims[0].stride, // stride planes + dims.spatial_dims[1].stride, // stride rows + dims.spatial_dims[2].stride, // stride cols + input_backprop_data); + + input_backprop_data += input_offset; + } + } else { + typedef Eigen::Map< + Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> + MatrixMap; + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> + ConstMatrixMap; + + for (int image_id = 0; image_id < dims.batch_size; + image_id += shard_size) { + const int shard_limit = + std::min(static_cast<int>(shard_size), + static_cast<int>(dims.batch_size) - image_id); + + auto shard = [&dims, &top_pad_planes, &top_pad_rows, &left_pad_cols, + &bottom_pad_planes, &bottom_pad_rows, &right_pad_cols, + &output_image_size, &filter_total_size, + &input_backprop_data, &col_buffer_data, + &out_backprop_data, &filter_data, &input_offset, + &output_offset, &size_C](int64 start, int64 limit) { + for (int shard_id = start; shard_id < limit; ++shard_id) { + T* im2col_buf = col_buffer_data + shard_id * size_C; + T* input_data = input_backprop_data + shard_id * input_offset; + const T* out_data = out_backprop_data + shard_id * output_offset; + + // Compute gradient into 'im2col_buf'. + MatrixMap C(im2col_buf, output_image_size, filter_total_size); + + ConstMatrixMap A(out_data, output_image_size, dims.out_depth); + ConstMatrixMap B(filter_data, filter_total_size, dims.out_depth); + + C.noalias() = A * B.transpose(); + + Col2im<T>(im2col_buf, dims.in_depth, + // Input spatial dimensions. + dims.spatial_dims[0].input_size, // input planes + dims.spatial_dims[1].input_size, // input rows + dims.spatial_dims[2].input_size, // input cols + // Filter spatial dimensions. + dims.spatial_dims[0].filter_size, // filter planes + dims.spatial_dims[1].filter_size, // filter rows + dims.spatial_dims[2].filter_size, // filter cols + // Spatial padding. + top_pad_planes, top_pad_rows, left_pad_cols, + bottom_pad_planes, bottom_pad_rows, right_pad_cols, + // Spatial striding. 
+ dims.spatial_dims[0].stride, // stride planes + dims.spatial_dims[1].stride, // stride rows + dims.spatial_dims[2].stride, // stride cols + input_data); + } + }; + Shard(worker_threads.num_threads, worker_threads.workers, shard_limit, + work_unit_size, shard); + + input_backprop_data += input_offset * shard_limit; + out_backprop_data += output_offset * shard_limit; + } + } } private: @@ -253,21 +571,48 @@ class Conv3DBackpropInputOp : public OpKernel { Padding padding_; TensorFormat data_format_; bool takes_shape_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv3DCustomBackpropInputOp); }; +// Custom backrop input kernel is 30% - 4x faster when compiled with AVX2 than +// default Eigen implementation (at the cost of ~2x-8x peak memory usage). + #define REGISTER_CPU_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ - Conv3DBackpropInputOp<CPUDevice, T>); \ + Conv3DCustomBackpropInputOp<CPUDevice, T>); \ REGISTER_KERNEL_BUILDER( \ Name("Conv3DBackpropInputV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ - Conv3DBackpropInputOp<CPUDevice, T>); + Conv3DCustomBackpropInputOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInput") \ + .Device(DEVICE_CPU) \ + .Label("custom") \ + .TypeConstraint<T>("T"), \ + Conv3DCustomBackpropInputOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2") \ + .Device(DEVICE_CPU) \ + .Label("custom") \ + .TypeConstraint<T>("T"), \ + Conv3DCustomBackpropInputOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInput") \ + .Device(DEVICE_CPU) \ + .Label("eigen_tensor") \ + .TypeConstraint<T>("T"), \ + Conv3DBackpropInputOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2") \ + .Device(DEVICE_CPU) \ + .Label("eigen_tensor") \ + .TypeConstraint<T>("T"), \ + Conv3DBackpropInputOp<CPUDevice, T>); + TF_CALL_half(REGISTER_CPU_KERNEL); TF_CALL_float(REGISTER_CPU_KERNEL); TF_CALL_double(REGISTER_CPU_KERNEL); #undef REGISTER_CPU_KERNEL -// Backprop for filter. +// Backprop for filter that offloads computation to +// Eigen::CuboidConvolutionBackwardFilter. 
template <typename Device, class T> class Conv3DBackpropFilterOp : public OpKernel { public: @@ -323,8 +668,11 @@ class Conv3DBackpropFilterOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); const TensorShape& input_shape = input.shape(); - TensorShape filter_shape; + const Tensor& out_backprop = context->input(2); + const TensorShape& out_backprop_shape = out_backprop.shape(); + + TensorShape filter_shape; if (takes_shape_) { const Tensor& filter_sizes = context->input(1); OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( @@ -333,13 +681,13 @@ class Conv3DBackpropFilterOp : public OpKernel { filter_shape = context->input(1).shape(); } - EXTRACT_AND_VERIFY_DIMENSIONS("Conv3DBackpropFilter"); - Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 5> pad_dims{ - {0, 0}, - {top_pad_planes, bottom_pad_planes}, - {top_pad_rows, bottom_pad_rows}, - {left_pad_cols, right_pad_cols}, - {0, 0}}; + ConvBackpropDimensions dims; + OP_REQUIRES_OK(context, + ConvBackpropComputeDimensions( + "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3, + input_shape, filter_shape, out_backprop_shape, stride_, + padding_, data_format_, &dims)); + Tensor* filter_backprop; OP_REQUIRES_OK(context, context->allocate_output(0, filter_shape, &filter_backprop)); @@ -349,70 +697,292 @@ class Conv3DBackpropFilterOp : public OpKernel { return; } - // For the backprop of the filter, we need to also transpose the - // out_backprop. - // The shape of backprop is - // [batch, out_z, out_y, out_x, out_depth] - // And we need to change it to - // [out_depth, out_x, out_y, out_z, batch] - Eigen::DSizes<Eigen::DenseIndex, 5> out_order{4, 1, 2, 3, 0}; - TensorShape padded_out_shape({out_depth, padded_out_planes, padded_out_rows, - padded_out_cols, batch}); - Tensor padded_output; + functor::CuboidConvolutionBackwardFilter<Device, T>()( + context->eigen_device<Device>(), + filter_backprop->tensor<T, 5>(), // filter_backward + input.tensor<T, 5>(), // input + out_backprop.tensor<T, 5>(), // output_backward + static_cast<int>(dims.spatial_dims[0].stride), // stride_planes + static_cast<int>(dims.spatial_dims[1].stride), // stride_rows + static_cast<int>(dims.spatial_dims[2].stride)); // stride_cols + } + + private: + std::vector<int32> dilation_; + std::vector<int32> stride_; + Padding padding_; + TensorFormat data_format_; + bool takes_shape_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv3DBackpropFilterOp); +}; + +// Custom backprop for filter that explicitly does the work sharding and calls +// Eigen only to multiply matrices. +template <typename Device, class T> +class Conv3DCustomBackpropFilterOp : public OpKernel { + // Limit the maximum size of allocated temporary buffer to + // kMaxTempAllocationOverhead times the size of the input tensors (input, + // filter, out_backprop). If the size of the temporary buffer exceeds this + // limit, fallback on Eigen implementation. + static constexpr int kMaxTempAllocationOverhead = 25; + + public: + explicit Conv3DCustomBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context), + data_format_(FORMAT_NHWC), + takes_shape_(type_string().find("V2") != std::string::npos) { + // data_format is only available in V2. 
+ if (takes_shape_) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES( + context, data_format_ == FORMAT_NHWC, + errors::InvalidArgument( + "Conv3DBackpropFilterOpV2 only supports NDHWC on the CPU.")); + } + + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, 'C') == 1 && + GetTensorDim(dilation_, data_format_, 'N') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth dimensions.")); + + // TODO(yangzihao): Add CPU version of dilated conv 3D. + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, '0') == 1 && + GetTensorDim(dilation_, data_format_, '1') == 1 && + GetTensorDim(dilation_, data_format_, '2') == 1), + errors::InvalidArgument( + "Current CPU implementation does not yet support " + "dilation rates larger than 1.")); + + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 5, + errors::InvalidArgument("Sliding window strides field must " + "specify 5 dimensions")); + OP_REQUIRES( + context, + (GetTensorDim(stride_, data_format_, 'C') == 1 && + GetTensorDim(stride_, data_format_, 'N') == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const TensorShape& input_shape = input.shape(); + + const Tensor& out_backprop = context->input(2); + const TensorShape& out_backprop_shape = out_backprop.shape(); + + TensorShape filter_shape; + if (takes_shape_) { + const Tensor& filter_sizes = context->input(1); + OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( + filter_sizes.vec<int32>(), &filter_shape)); + } else { + filter_shape = context->input(1).shape(); + } + + ConvBackpropDimensions dims; OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum<T>::v(), - padded_out_shape, &padded_output)); - Eigen::DSizes<Eigen::DenseIndex, 5> eigen_strides{1, strides[0], strides[1], - strides[2], 1}; - functor::InflatePadAndShuffle<Device, T, 5, Eigen::DenseIndex>()( - context->eigen_device<Device>(), out_backprop.tensor<T, 5>(), - eigen_strides, pad_dims, out_order, padded_output.tensor<T, 5>()); - const Tensor& padded_output_cref = padded_output; - - // For the backprop of the filter, we need to transpose the input. - // The shape of input is - // [batch, in_z, in_y, in_x, in_depth] - // And we need to change it to - // [in_z, in_y, in_x, batch, in_depth] - Eigen::DSizes<Eigen::DenseIndex, 5> in_order{1, 2, 3, 0, 4}; - TensorShape in_shuffle_shape( - {input_size[0], input_size[1], input_size[2], batch, in_depth}); - Tensor in_shuffle; + ConvBackpropComputeDimensions( + "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3, + input_shape, filter_shape, out_backprop_shape, stride_, + padding_, data_format_, &dims)); + + Tensor* filter_backprop; OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum<T>::v(), - in_shuffle_shape, &in_shuffle)); - // No need for reversing this time. 
- Eigen::array<bool, 5> no_reverse{false, false, false, false, false}; - functor::ShuffleAndReverse<Device, T, 5, Eigen::DenseIndex>()( - context->eigen_device<Device>(), input.tensor<T, 5>(), in_order, - no_reverse, in_shuffle.tensor<T, 5>()); - const Tensor& in_shuffle_cref = in_shuffle; - - // The output of the conv_3d would be - // [out_depth, filter_size[2], filter_size[1], filter_size[0], in_depth] - // and we need to shuffle it back to - // [filter_size[2], filter_size[1], filter_size[0], in_depth, out_depth]; - // And we need to reverse the filter backprops. - // So we need to allocate (sigh) yet another piece of memory to hold the - // output. - TensorShape filter_shuffle_shape( - {out_depth, filter_size[0], filter_size[1], filter_size[2], in_depth}); - Tensor filter_shuffle; - OP_REQUIRES_OK( - context, context->allocate_temp(DataTypeToEnum<T>::v(), - filter_shuffle_shape, &filter_shuffle)); - functor::CuboidConvolution<Device, T>()( - context->eigen_device<Device>(), filter_shuffle.tensor<T, 5>(), - padded_output_cref.tensor<T, 5>(), in_shuffle_cref.tensor<T, 5>(), 1, 1, - 1, BrainPadding2EigenPadding(VALID)); - - // Now copy the filter_backprop back to the destination. - Eigen::DSizes<Eigen::DenseIndex, 5> filter_order{1, 2, 3, 4, 0}; - Eigen::array<bool, 5> filter_rev_dims{true, true, true, false, false}; - const Tensor& filter_shuffle_cref = filter_shuffle; - functor::ShuffleAndReverse<Device, T, 5, Eigen::DenseIndex>()( - context->eigen_device<Device>(), filter_shuffle_cref.tensor<T, 5>(), - filter_order, filter_rev_dims, filter_backprop->tensor<T, 5>()); + context->allocate_output(0, filter_shape, &filter_backprop)); + + if (input_shape.num_elements() == 0) { + filter_backprop->template flat<T>().setZero(); + return; + } + + int64 top_pad_planes, bottom_pad_planes; + int64 top_pad_rows, bottom_pad_rows; + int64 left_pad_cols, right_pad_cols; + + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + dims.spatial_dims[0].input_size, + dims.spatial_dims[0].filter_size, + dims.spatial_dims[0].stride, padding_, + &dims.spatial_dims[0].output_size, + &top_pad_planes, &bottom_pad_planes)); + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + dims.spatial_dims[1].input_size, + dims.spatial_dims[1].filter_size, + dims.spatial_dims[1].stride, padding_, + &dims.spatial_dims[1].output_size, + &top_pad_rows, &bottom_pad_rows)); + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + dims.spatial_dims[2].input_size, + dims.spatial_dims[2].filter_size, + dims.spatial_dims[2].stride, padding_, + &dims.spatial_dims[2].output_size, + &left_pad_cols, &right_pad_cols)); + + // TODO(ezhulenev): Extract work size and shard estimation to shared + // functions in conv_grad_ops, and update 2d convolution backprop. + + // The total dimension size of each kernel. + const int64 filter_total_size = + dims.spatial_dims[0].filter_size * dims.spatial_dims[1].filter_size * + dims.spatial_dims[2].filter_size * dims.in_depth; + // The output image size is the spatial size of the output. + const int64 output_image_size = dims.spatial_dims[0].output_size * + dims.spatial_dims[1].output_size * + dims.spatial_dims[2].output_size; + + // Shard 'batch' images (volumes) into 'shard_size' groups of images + // (volumes) to be fed into the parallel matmul. Calculate 'shard_size' by + // dividing the L3 cache size ('target_working_set_size') by the matmul size + // of an individual image ('work_unit_size'). 
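Each group of `shard_size` images is then handed to `Shard(...)`, which splits a range of work-unit indices across the CPU worker threads (the input kernel above and the filter loop below both use it). A rough stand-in for that pattern, not `tensorflow::Shard` itself, which additionally weighs ranges by a per-unit cost estimate:

```cpp
#include <algorithm>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

// Simplified stand-in: run a work function over disjoint half-open ranges
// [start, limit) of unit indices, one range per thread.
void RunSharded(int num_threads, int total,
                const std::function<void(int, int)>& work) {
  std::vector<std::thread> threads;
  const int per_thread = (total + num_threads - 1) / num_threads;
  for (int start = 0; start < total; start += per_thread) {
    const int limit = std::min(total, start + per_thread);
    threads.emplace_back(work, start, limit);
  }
  for (auto& t : threads) t.join();
}

int main() {
  // E.g. shard_limit images in the current group, one im2col/matmul each.
  const int shard_limit = 8;
  std::vector<int> processed(shard_limit, 0);
  RunSharded(/*num_threads=*/4, shard_limit, [&](int start, int limit) {
    for (int i = start; i < limit; ++i) processed[i] = 1;  // per-image work
  });
  for (int p : processed) std::cout << p;  // 11111111
  std::cout << "\n";
}
```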
+ + const auto cache_sizes = Eigen::internal::CacheSizes(); + const ptrdiff_t l3_cache_size = cache_sizes.m_l3; + + // TODO(andydavis) + // *) Consider reducing 'target_working_set_size' if L3 is shared by + // other concurrently running tensorflow ops. + const size_t target_working_set_size = l3_cache_size / sizeof(T); + + const int64 size_A = output_image_size * filter_total_size; + + const int64 size_B = output_image_size * dims.out_depth; + + const int64 size_C = filter_total_size * dims.out_depth; + + const int64 work_unit_size = size_A + size_B + size_C; + + const size_t shard_size = + (target_working_set_size + work_unit_size - 1) / work_unit_size; + + // Total number of elements in all the tensors used by this kernel. + int64 total_tensor_elements = input_shape.num_elements() + + filter_shape.num_elements() + + out_backprop_shape.num_elements(); + + // Shape of the temporary workspace buffer. + TensorShape col_buffer_shape = {static_cast<int64>(shard_size), + static_cast<int64>(output_image_size), + static_cast<int64>(filter_total_size)}; + int64 col_buffer_elements = col_buffer_shape.num_elements(); + + // If the temporary allocation overhead is too large, fallback on Eigen + // implementation which requires much less memory. + int64 col_buffer_overhead = col_buffer_elements / total_tensor_elements; + if (col_buffer_overhead > kMaxTempAllocationOverhead) { + VLOG(2) << "Fallback on Eigen implementation of Conv3DBackpropFilterOp: " + "col_buffer_overhead=" + << col_buffer_overhead; + + functor::CuboidConvolutionBackwardFilter<Device, T>()( + context->eigen_device<Device>(), + filter_backprop->tensor<T, 5>(), // filter_backward + input.tensor<T, 5>(), // input + out_backprop.tensor<T, 5>(), // output_backward + static_cast<int>(dims.spatial_dims[0].stride), // stride_planes + static_cast<int>(dims.spatial_dims[1].stride), // stride_rows + static_cast<int>(dims.spatial_dims[2].stride)); // stride_cols + + return; + } + + Tensor col_buffer; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::value, + col_buffer_shape, &col_buffer)); + + // The input offset corresponding to a single input image. + const int64 input_offset = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * + dims.spatial_dims[2].input_size * dims.in_depth; + // The output offset corresponding to a single output image. + const int64 output_offset = + dims.spatial_dims[0].output_size * dims.spatial_dims[1].output_size * + dims.spatial_dims[2].output_size * dims.out_depth; + + const T* input_data = input.template flat<T>().data(); + T* col_buffer_data = col_buffer.template flat<T>().data(); + const T* out_backprop_data = out_backprop.template flat<T>().data(); + T* filter_backprop_data = filter_backprop->template flat<T>().data(); + + typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, + Eigen::Unaligned> + TensorMap; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>, + Eigen::Unaligned> + ConstTensorMap; + + TensorMap C(filter_backprop_data, filter_total_size, dims.out_depth); + C.setZero(); + + // Initialize contraction dims (we need to transpose 'A' below). 
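The loop that follows packs each input image into rows of the column buffer with `Im2col` and then contracts the packed patches against `out_backprop` over the output-position dimension (the {0, 0} contraction pair set up next), accumulating into the filter gradient. A toy 1D sketch of the same pattern with hypothetical sizes (TF's `Im2col` handles volumes, channels and padding):

```cpp
#include <iostream>
#include <vector>

int main() {
  const std::vector<float> input = {1, 2, 3, 4, 5};   // W = 5
  const std::vector<float> out_grad = {0.1f, 0.2f};   // out_w = 2
  const int K = 3, stride = 2;
  const int out_w = (static_cast<int>(input.size()) - K) / stride + 1;  // 2

  // im2col: each output position becomes one row of K input values.
  std::vector<std::vector<float>> patches(out_w, std::vector<float>(K));
  for (int i = 0; i < out_w; ++i)
    for (int k = 0; k < K; ++k) patches[i][k] = input[i * stride + k];

  // Filter gradient = patches^T * out_grad, i.e. contracting over the
  // output-position dimension: every patch contributes to every filter tap,
  // weighted by its output gradient.
  std::vector<float> filter_grad(K, 0.0f);
  for (int i = 0; i < out_w; ++i)
    for (int k = 0; k < K; ++k) filter_grad[k] += patches[i][k] * out_grad[i];

  for (float g : filter_grad) std::cout << g << " ";  // 0.7 1.0 1.3
  std::cout << "\n";
}
```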
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_dims; + contract_dims[0].first = 0; + contract_dims[0].second = 0; + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + + for (int image_id = 0; image_id < dims.batch_size; image_id += shard_size) { + const int shard_limit = + std::min(static_cast<int>(shard_size), + static_cast<int>(dims.batch_size) - image_id); + + auto shard = [&input_data, &col_buffer_data, &dims, &top_pad_planes, + &top_pad_rows, &left_pad_cols, &bottom_pad_planes, + &bottom_pad_rows, &right_pad_cols, &input_offset, + &size_A](int64 start, int64 limit) { + for (int shard_id = start; shard_id < limit; ++shard_id) { + const T* input_data_shard = input_data + shard_id * input_offset; + T* col_data_shard = col_buffer_data + shard_id * size_A; + + // When we compute the gradient with respect to the filters, we need + // to do im2col to allow gemm-type computation. + Im2col<T>(input_data_shard, dims.in_depth, + // Input spatial dimensions. + dims.spatial_dims[0].input_size, // input planes + dims.spatial_dims[1].input_size, // input rows + dims.spatial_dims[2].input_size, // input cols + // Filter spatial dimensions. + dims.spatial_dims[0].filter_size, // filter planes + dims.spatial_dims[1].filter_size, // filter rows + dims.spatial_dims[2].filter_size, // filter cols + // Spatial padding. + top_pad_planes, top_pad_rows, left_pad_cols, + bottom_pad_planes, bottom_pad_rows, right_pad_cols, + // Spatial striding. + dims.spatial_dims[0].stride, // stride planes + dims.spatial_dims[1].stride, // stride rows + dims.spatial_dims[2].stride, // stride cols + col_data_shard); + } + }; + Shard(worker_threads.num_threads, worker_threads.workers, shard_limit, + size_A, shard); + + ConstTensorMap A(col_buffer_data, output_image_size * shard_limit, + filter_total_size); + ConstTensorMap B(out_backprop_data, output_image_size * shard_limit, + dims.out_depth); + + // Gradient with respect to filter. + C.device(context->eigen_cpu_device()) += A.contract(B, contract_dims); + + input_data += input_offset * shard_limit; + out_backprop_data += output_offset * shard_limit; + } } private: @@ -421,21 +991,60 @@ class Conv3DBackpropFilterOp : public OpKernel { Padding padding_; TensorFormat data_format_; bool takes_shape_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv3DCustomBackpropFilterOp); }; +// Custom backrop input kernel is 30% - 4x faster when compiled with AVX2 than +// default Eigen implementation (at the cost of ~2x-8x peak memory usage). 
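The registrations below expose each CPU op under an unlabeled default (the custom kernel) plus explicit "custom" and "eigen_tensor" labels, so callers can force a particular implementation; in TensorFlow graphs the label is taken from the node's `_kernel` attribute. A toy registry, purely illustrative and unrelated to `REGISTER_KERNEL_BUILDER`'s real machinery, showing the selection idea:

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <string>

// Toy registry keyed by (op name, label). The unlabeled entry is the default.
using Kernel = std::function<void()>;
std::map<std::pair<std::string, std::string>, Kernel>& Registry() {
  static std::map<std::pair<std::string, std::string>, Kernel> registry;
  return registry;
}

int main() {
  Registry()[{"Conv3DBackpropInputV2", ""}] = [] {
    std::cout << "custom (default)\n";
  };
  Registry()[{"Conv3DBackpropInputV2", "custom"}] = [] {
    std::cout << "custom\n";
  };
  Registry()[{"Conv3DBackpropInputV2", "eigen_tensor"}] = [] {
    std::cout << "eigen_tensor\n";
  };

  // A caller that sets the label explicitly gets the Eigen variant...
  Registry()[{"Conv3DBackpropInputV2", "eigen_tensor"}]();  // eigen_tensor
  // ...otherwise the unlabeled registration (the custom kernel) is used.
  Registry()[{"Conv3DBackpropInputV2", ""}]();              // custom (default)
}
```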
+ #define REGISTER_CPU_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ - Conv3DBackpropFilterOp<CPUDevice, T>); \ + Conv3DCustomBackpropFilterOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T"), \ + Conv3DCustomBackpropFilterOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .Label("custom") \ + .TypeConstraint<T>("T"), \ + Conv3DCustomBackpropFilterOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2") \ + .Device(DEVICE_CPU) \ + .Label("custom") \ + .TypeConstraint<T>("T"), \ + Conv3DCustomBackpropFilterOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .Label("eigen_tensor") \ + .TypeConstraint<T>("T"), \ + Conv3DBackpropFilterOp<CPUDevice, T>); \ REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2") \ .Device(DEVICE_CPU) \ + .Label("eigen_tensor") \ .TypeConstraint<T>("T"), \ Conv3DBackpropFilterOp<CPUDevice, T>); -TF_CALL_half(REGISTER_CPU_KERNEL); + TF_CALL_float(REGISTER_CPU_KERNEL); TF_CALL_double(REGISTER_CPU_KERNEL); #undef REGISTER_CPU_KERNEL +// WARNING: Eigen::half is not trivially copyable and can't be used in +// custom backprop filter kernel because of memcpy and memset in Im2col. +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + Conv3DBackpropFilterOp<CPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T"), \ + Conv3DBackpropFilterOp<CPUDevice, T>); + +TF_CALL_half(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL + // GPU definitions of both ops. #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. 
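The half-precision registrations above fall back to the Eigen-based kernels because, as the warning notes, `Im2col`/`Col2im` in this file move data with `memcpy`/`memset`, which is only well-defined for trivially copyable element types. A small guard illustrating the constraint (a hypothetical helper, not the actual packing code):

```cpp
#include <cstring>
#include <type_traits>

// memcpy-based packing only makes sense for trivially copyable element types;
// a static_assert makes the requirement explicit at compile time.
template <typename T>
void FillColumn(T* dst, const T* src, int n) {
  static_assert(std::is_trivially_copyable<T>::value,
                "memcpy-based packing requires a trivially copyable type");
  std::memcpy(dst, src, n * sizeof(T));
}

int main() {
  float a[4] = {1, 2, 3, 4}, b[4];
  FillColumn(b, a, 4);  // fine: float is trivially copyable
  // Instantiating FillColumn with Eigen::half would fail this guard,
  // assuming Eigen::half is not trivially copyable, as the warning above
  // states for this Eigen version.
}
```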
@@ -445,7 +1054,8 @@ namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void TransformFilter<GPUDevice, T, int, 5>::operator()( \ - const GPUDevice& d, typename TTypes<T, 5, int>::ConstTensor in, \ + const GPUDevice& d, FilterTensorFormat dst_filter_format, \ + typename TTypes<T, 5, int>::ConstTensor in, \ typename TTypes<T, 5, int>::Tensor out); \ template <> \ void ReverseTransformFilter<GPUDevice, T, 5>::operator()( \ @@ -523,6 +1133,10 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& filter = context->input(1); const TensorShape& filter_shape = filter.shape(); + + const Tensor& out_backprop = context->input(2); + const TensorShape& out_backprop_shape = out_backprop.shape(); + TensorShape input_shape; if (takes_shape_) { const Tensor& input_sizes = context->input(0); @@ -531,7 +1145,14 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { } else { input_shape = context->input(0).shape(); } - EXTRACT_AND_VERIFY_DIMENSIONS("Conv3DBackpropInput"); + + ConvBackpropDimensions dims; + OP_REQUIRES_OK(context, + ConvBackpropComputeDimensionsV2( + "Conv3DBackpropInputOp", /*num_spatial_dims=*/3, + input_shape, filter_shape, out_backprop_shape, dilation_, + stride_, padding_, data_format_, &dims)); + Tensor* in_backprop; OP_REQUIRES_OK(context, context->allocate_output(0, input_shape, &in_backprop)); @@ -539,13 +1160,15 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { auto* stream = context->op_device_context()->stream(); OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - if (filter_size[0] == 1 && filter_size[1] == 1 && filter_size[2] == 1 && - dilation_[0] == 1 && dilation_[1] == 1 && dilation_[2] == 1 && - stride_[0] == 1 && stride_[1] == 1 && stride_[2] == 1 && + if (dims.filter_size(0) == 1 && dims.filter_size(1) == 1 && + dims.filter_size(2) == 1 && dims.dilation(0) == 1 && + dims.dilation(1) == 1 && dims.dilation(2) == 1 && dims.stride(0) == 1 && + dims.stride(1) == 1 && dims.stride(2) == 1 && data_format_ == FORMAT_NHWC) { - const uint64 m = batch * input_size[0] * input_size[1] * input_size[2]; - const uint64 k = out_depth; - const uint64 n = in_depth; + const uint64 m = dims.batch_size * dims.input_size(0) * + dims.input_size(1) * dims.input_size(2); + const uint64 k = dims.out_depth; + const uint64 n = dims.in_depth; auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), out_backprop.template flat<T>().size()); @@ -567,13 +1190,14 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { ", n=", n, ", k=", k)); } return; - } else if (filter_size[0] == input_size[0] && - filter_size[1] == input_size[1] && - filter_size[2] == input_size[2] && padding_ == Padding::VALID && - data_format_ == FORMAT_NHWC) { - const uint64 m = batch; - const uint64 k = out_depth; - const uint64 n = input_size[0] * input_size[1] * input_size[2] * in_depth; + } else if (dims.filter_size(0) == dims.input_size(0) && + dims.filter_size(1) == dims.input_size(1) && + dims.filter_size(2) == dims.input_size(2) && + padding_ == Padding::VALID && data_format_ == FORMAT_NHWC) { + const uint64 m = dims.batch_size; + const uint64 k = dims.out_depth; + const uint64 n = dims.input_size(0) * dims.input_size(1) * + dims.input_size(2) * dims.in_depth; auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), out_backprop.template flat<T>().size()); @@ -597,65 +1221,59 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { return; } - int 
padding_rows = 0, padding_cols = 0, padding_planes = 0; - - if (padding_ == Padding::SAME) { - padding_planes = std::max<int>( - 0, (output_planes - 1) * strides[0] + filter_size[0] - input_size[0]); - padding_cols = std::max<int>( - 0, (output_cols - 1) * strides[2] + filter_size[2] - input_size[2]); - padding_rows = std::max<int>( - 0, (output_rows - 1) * strides[1] + filter_size[1] - input_size[1]); - } + int padding_planes = dims.SpatialPadding(padding_, 0); + int padding_rows = dims.SpatialPadding(padding_, 1); + int padding_cols = dims.SpatialPadding(padding_, 2); + const bool planes_odd = (padding_planes % 2 != 0); const bool rows_odd = (padding_rows % 2 != 0); const bool cols_odd = (padding_cols % 2 != 0); - const bool planes_odd = (padding_planes % 2 != 0); TensorShape compatible_input_shape; if (rows_odd || cols_odd || planes_odd) { // cuDNN only supports the same amount of padding on both sides. compatible_input_shape = { - batch, - in_depth, - input_size[0] + planes_odd, - input_size[1] + rows_odd, - input_size[2] + cols_odd, + dims.batch_size, + dims.in_depth, + dims.input_size(0) + planes_odd, + dims.input_size(1) + rows_odd, + dims.input_size(2) + cols_odd, }; } else { - compatible_input_shape = {batch, in_depth, input_size[0], input_size[1], - input_size[2]}; + compatible_input_shape = {dims.batch_size, dims.in_depth, + dims.input_size(0), dims.input_size(1), + dims.input_size(2)}; } CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0) << "Negative paddings: (" << padding_rows << ", " << padding_cols << ", " << padding_planes << ")"; se::dnn::BatchDescriptor input_desc(3); - input_desc.set_count(batch) + input_desc.set_count(dims.batch_size) .set_spatial_dim(DimIndex::X, compatible_input_shape.dim_size(4)) .set_spatial_dim(DimIndex::Y, compatible_input_shape.dim_size(3)) .set_spatial_dim(DimIndex::Z, compatible_input_shape.dim_size(2)) - .set_feature_map_count(in_depth) + .set_feature_map_count(dims.in_depth) .set_layout(se::dnn::DataLayout::kBatchDepthYX); se::dnn::BatchDescriptor output_desc(3); - output_desc.set_count(batch) - .set_spatial_dim(DimIndex::X, output_cols) - .set_spatial_dim(DimIndex::Y, output_rows) - .set_spatial_dim(DimIndex::Z, output_planes) - .set_feature_map_count(out_depth) + output_desc.set_count(dims.batch_size) + .set_spatial_dim(DimIndex::X, dims.output_size(2)) + .set_spatial_dim(DimIndex::Y, dims.output_size(1)) + .set_spatial_dim(DimIndex::Z, dims.output_size(0)) + .set_feature_map_count(dims.out_depth) .set_layout(se::dnn::DataLayout::kBatchDepthYX); se::dnn::FilterDescriptor filter_desc(3); - filter_desc.set_spatial_dim(DimIndex::X, filter_size[2]) - .set_spatial_dim(DimIndex::Y, filter_size[1]) - .set_spatial_dim(DimIndex::Z, filter_size[0]) - .set_input_feature_map_count(in_depth) - .set_output_feature_map_count(out_depth); + filter_desc.set_spatial_dim(DimIndex::X, dims.filter_size(2)) + .set_spatial_dim(DimIndex::Y, dims.filter_size(1)) + .set_spatial_dim(DimIndex::Z, dims.filter_size(0)) + .set_input_feature_map_count(dims.in_depth) + .set_output_feature_map_count(dims.out_depth); se::dnn::ConvolutionDescriptor conv_desc(3); - conv_desc.set_dilation_rate(DimIndex::X, dilations[2]) - .set_dilation_rate(DimIndex::Y, dilations[1]) - .set_dilation_rate(DimIndex::Z, dilations[0]) - .set_filter_stride(DimIndex::X, strides[2]) - .set_filter_stride(DimIndex::Y, strides[1]) - .set_filter_stride(DimIndex::Z, strides[0]) + conv_desc.set_dilation_rate(DimIndex::X, dims.dilation(2)) + .set_dilation_rate(DimIndex::Y, dims.dilation(1)) 
+ .set_dilation_rate(DimIndex::Z, dims.dilation(0)) + .set_filter_stride(DimIndex::X, dims.stride(2)) + .set_filter_stride(DimIndex::Y, dims.stride(1)) + .set_filter_stride(DimIndex::Z, dims.stride(0)) .set_zero_padding(DimIndex::X, padding_cols / 2) .set_zero_padding(DimIndex::Y, padding_rows / 2) .set_zero_padding(DimIndex::Z, padding_planes / 2); @@ -664,20 +1282,23 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { Tensor transformed_filter; OP_REQUIRES_OK( context, - context->allocate_temp(DataTypeToEnum<T>::value, - TensorShape({out_depth, in_depth, filter_size[0], - filter_size[1], filter_size[2]}), - &transformed_filter)); + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({dims.out_depth, dims.in_depth, dims.filter_size(0), + dims.filter_size(1), dims.filter_size(2)}), + &transformed_filter)); functor::TransformFilter<GPUDevice, T, int, 5>()( - context->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 5>()), + context->eigen_device<GPUDevice>(), FORMAT_OIHW, + To32Bit(filter.tensor<T, 5>()), To32Bit(transformed_filter.tensor<T, 5>())); // Shape: batch, filters, z, y, x. Tensor transformed_out_backprop; if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = {batch, out_depth, output_planes, output_rows, - output_cols}; - if (out_depth > 1) { + TensorShape nchw_shape = {dims.batch_size, dims.out_depth, + dims.output_size(0), dims.output_size(1), + dims.output_size(2)}; + if (dims.out_depth > 1) { OP_REQUIRES_OK(context, context->allocate_temp( DataTypeToEnum<T>::value, nchw_shape, &transformed_out_backprop)); @@ -713,14 +1334,14 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { const int device_id = stream->parent()->device_ordinal(); DataType dtype = context->input(0).dtype(); const ConvParameters conv_parameters = { - batch, - in_depth, - {{input_size[0], input_size[1], input_size[2]}}, + dims.batch_size, + dims.in_depth, + {{dims.input_size(0), dims.input_size(1), dims.input_size(2)}}, FORMAT_NCHW, - out_depth, - {{filter_size[0], filter_size[1], filter_size[2]}}, - {{dilations[0], dilations[1], dilations[2]}}, - {{strides[0], strides[1], strides[2]}}, + dims.out_depth, + {{dims.filter_size(0), dims.filter_size(1), dims.filter_size(2)}}, + {{dims.dilation(0), dims.dilation(1), dims.dilation(2)}}, + {{dims.stride(0), dims.stride(1), dims.stride(2)}}, {{padding_planes, padding_rows, padding_cols}}, dtype, device_id, @@ -799,10 +1420,11 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { if (rows_odd || cols_odd || planes_odd) { Tensor in_backprop_remove_padding; OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum<T>::value, - {batch, in_depth, input_size[0], - input_size[1], input_size[2]}, - &in_backprop_remove_padding)); + context->allocate_temp( + DataTypeToEnum<T>::value, + {dims.batch_size, dims.in_depth, dims.input_size(0), + dims.input_size(1), dims.input_size(2)}, + &in_backprop_remove_padding)); // Remove the padding for odd spatial dimensions. 
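The odd-padding handling above exists because cuDNN takes a single symmetric zero-padding value per dimension, while SAME padding can require an odd total. The kernel therefore grows the input by one element along any odd dimension (`compatible_input_shape`), uses `pad_total / 2` on both sides, and slices the extra element back off the computed gradient afterwards. A sketch of the arithmetic with example numbers:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t input = 4, filter = 3, stride = 2;
  const int64_t output = (input + stride - 1) / stride;  // SAME: ceil(4/2) = 2
  const int64_t pad_total =
      std::max<int64_t>(0, (output - 1) * stride + filter - input);  // 1, odd

  const bool odd = (pad_total % 2 != 0);
  const int64_t compatible_input = input + (odd ? 1 : 0);  // pad input to 5
  const int64_t symmetric_pad = pad_total / 2;             // 0 on each side

  // Asymmetric SAME padding (0 before, 1 after) becomes: append one element
  // to the input and let cuDNN pad symmetrically by pad_total / 2.
  std::cout << "compatible_input=" << compatible_input
            << " zero_padding_per_side=" << symmetric_pad << "\n";
}
```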
functor::PadInput<GPUDevice, T, int, 5>()( @@ -896,6 +1518,10 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); const TensorShape& input_shape = input.shape(); + + const Tensor& out_backprop = context->input(2); + const TensorShape& out_backprop_shape = out_backprop.shape(); + TensorShape filter_shape; if (takes_shape_) { const Tensor& filter_sizes = context->input(1); @@ -905,7 +1531,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { filter_shape = context->input(1).shape(); } - EXTRACT_AND_VERIFY_DIMENSIONS("Conv3DBackpropFilter"); + ConvBackpropDimensions dims; + OP_REQUIRES_OK(context, + ConvBackpropComputeDimensionsV2( + "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3, + input_shape, filter_shape, out_backprop_shape, dilation_, + stride_, padding_, data_format_, &dims)); Tensor* filter_backprop; OP_REQUIRES_OK(context, @@ -914,13 +1545,15 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { auto* stream = context->op_device_context()->stream(); OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - if (filter_size[1] == 1 && filter_size[2] == 1 && filter_size[0] == 1 && - dilations[2] == 1 && dilations[1] == 1 && dilations[0] == 1 && - strides[2] == 1 && strides[1] == 1 && strides[0] == 1 && + if (dims.filter_size(1) == 1 && dims.filter_size(2) == 1 && + dims.filter_size(0) == 1 && dims.dilation(2) == 1 && + dims.dilation(1) == 1 && dims.dilation(0) == 1 && dims.stride(2) == 1 && + dims.stride(1) == 1 && dims.stride(0) == 1 && data_format_ == FORMAT_NHWC) { - const uint64 m = in_depth; - const uint64 k = batch * input_size[1] * input_size[2] * input_size[0]; - const uint64 n = out_depth; + const uint64 m = dims.in_depth; + const uint64 k = dims.batch_size * dims.input_size(1) * + dims.input_size(2) * dims.input_size(0); + const uint64 n = dims.out_depth; // The shape of output backprop is // [batch, out_z, out_y, out_x, out_depth] @@ -951,13 +1584,14 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { ", n=", n, ", k=", k)); } return; - } else if (filter_size[0] == input_size[0] && - filter_size[1] == input_size[1] && - filter_size[2] == input_size[2] && padding_ == Padding::VALID && - data_format_ == FORMAT_NHWC) { - const uint64 m = input_size[0] * input_size[1] * input_size[2] * in_depth; - const uint64 k = batch; - const uint64 n = out_depth; + } else if (dims.filter_size(0) == dims.input_size(0) && + dims.filter_size(1) == dims.input_size(1) && + dims.filter_size(2) == dims.input_size(2) && + padding_ == Padding::VALID && data_format_ == FORMAT_NHWC) { + const uint64 m = dims.input_size(0) * dims.input_size(1) * + dims.input_size(2) * dims.in_depth; + const uint64 k = dims.batch_size; + const uint64 n = dims.out_depth; auto a_ptr = AsDeviceMemory(input.template flat<T>().data(), input.template flat<T>().size()); @@ -979,30 +1613,24 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { return; } - int padding_rows = 0, padding_cols = 0, padding_planes = 0; - - if (padding_ == Padding::SAME) { - padding_planes = std::max<int>( - 0, (output_planes - 1) * strides[0] + filter_size[0] - input_size[0]); - padding_cols = std::max<int>( - 0, (output_cols - 1) * strides[2] + filter_size[2] - input_size[2]); - padding_rows = std::max<int>( - 0, (output_rows - 1) * strides[1] + filter_size[1] - input_size[1]); - } - bool rows_odd = (padding_rows % 2 != 0); - bool cols_odd = (padding_cols % 2 != 0); - bool 
planes_odd = (padding_planes % 2 != 0); + int padding_planes = dims.SpatialPadding(padding_, 0); + int padding_rows = dims.SpatialPadding(padding_, 1); + int padding_cols = dims.SpatialPadding(padding_, 2); + const bool planes_odd = (padding_planes % 2 != 0); + const bool rows_odd = (padding_rows % 2 != 0); + const bool cols_odd = (padding_cols % 2 != 0); Tensor compatible_input; if (rows_odd || cols_odd || planes_odd) { - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum<T>::value, - ShapeFromFormat(data_format_, batch, - {{input_size[0] + planes_odd, - input_size[1] + rows_odd, - input_size[2] + cols_odd}}, - in_depth), - &compatible_input)); + OP_REQUIRES_OK(context, + context->allocate_temp( + DataTypeToEnum<T>::value, + ShapeFromFormat(data_format_, dims.batch_size, + {{dims.input_size(0) + planes_odd, + dims.input_size(1) + rows_odd, + dims.input_size(2) + cols_odd}}, + dims.in_depth), + &compatible_input)); functor::PadInput<GPUDevice, T, int, 5>()( context->template eigen_device<GPUDevice>(), To32Bit(input.tensor<T, 5>()), {{0, 0, 0}}, @@ -1016,35 +1644,35 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { << "Negative paddings: (" << padding_rows << ", " << padding_cols << ", " << padding_planes << ")"; se::dnn::BatchDescriptor input_desc(3); - input_desc.set_count(batch) + input_desc.set_count(dims.batch_size) .set_spatial_dim(DimIndex::X, GetTensorDim(compatible_input, data_format_, '2')) .set_spatial_dim(DimIndex::Y, GetTensorDim(compatible_input, data_format_, '1')) .set_spatial_dim(DimIndex::Z, GetTensorDim(compatible_input, data_format_, '0')) - .set_feature_map_count(in_depth) + .set_feature_map_count(dims.in_depth) .set_layout(se::dnn::DataLayout::kBatchDepthYX); se::dnn::BatchDescriptor output_desc(3); - output_desc.set_count(batch) - .set_spatial_dim(DimIndex::X, output_cols) - .set_spatial_dim(DimIndex::Y, output_rows) - .set_spatial_dim(DimIndex::Z, output_planes) - .set_feature_map_count(out_depth) + output_desc.set_count(dims.batch_size) + .set_spatial_dim(DimIndex::X, dims.output_size(2)) + .set_spatial_dim(DimIndex::Y, dims.output_size(1)) + .set_spatial_dim(DimIndex::Z, dims.output_size(0)) + .set_feature_map_count(dims.out_depth) .set_layout(se::dnn::DataLayout::kBatchDepthYX); se::dnn::FilterDescriptor filter_desc(3); - filter_desc.set_spatial_dim(DimIndex::X, filter_size[2]) - .set_spatial_dim(DimIndex::Y, filter_size[1]) - .set_spatial_dim(DimIndex::Z, filter_size[0]) - .set_input_feature_map_count(in_depth) - .set_output_feature_map_count(out_depth); + filter_desc.set_spatial_dim(DimIndex::X, dims.filter_size(2)) + .set_spatial_dim(DimIndex::Y, dims.filter_size(1)) + .set_spatial_dim(DimIndex::Z, dims.filter_size(0)) + .set_input_feature_map_count(dims.in_depth) + .set_output_feature_map_count(dims.out_depth); se::dnn::ConvolutionDescriptor conv_desc(3); - conv_desc.set_dilation_rate(DimIndex::X, dilations[2]) - .set_dilation_rate(DimIndex::Y, dilations[1]) - .set_dilation_rate(DimIndex::Z, dilations[0]) - .set_filter_stride(DimIndex::X, strides[2]) - .set_filter_stride(DimIndex::Y, strides[1]) - .set_filter_stride(DimIndex::Z, strides[0]) + conv_desc.set_dilation_rate(DimIndex::X, dims.dilation(2)) + .set_dilation_rate(DimIndex::Y, dims.dilation(1)) + .set_dilation_rate(DimIndex::Z, dims.dilation(0)) + .set_filter_stride(DimIndex::X, dims.stride(2)) + .set_filter_stride(DimIndex::Y, dims.stride(1)) + .set_filter_stride(DimIndex::Z, dims.stride(0)) .set_zero_padding(DimIndex::X, padding_cols / 2) .set_zero_padding(DimIndex::Y, 
padding_rows / 2) .set_zero_padding(DimIndex::Z, padding_planes / 2); @@ -1052,19 +1680,21 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { Tensor pre_transformed_filter_backprop; OP_REQUIRES_OK( context, - context->allocate_temp(DataTypeToEnum<T>::value, - TensorShape({out_depth, in_depth, filter_size[0], - filter_size[1], filter_size[2]}), - &pre_transformed_filter_backprop)); + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({dims.out_depth, dims.in_depth, dims.filter_size(0), + dims.filter_size(1), dims.filter_size(2)}), + &pre_transformed_filter_backprop)); Tensor transformed_out_backprop; if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = {batch, out_depth, output_planes, output_rows, - output_cols}; + TensorShape nchw_shape = {dims.batch_size, dims.out_depth, + dims.output_size(0), dims.output_size(1), + dims.output_size(2)}; OP_REQUIRES_OK( context, context->allocate_temp(DataTypeToEnum<T>::value, nchw_shape, &transformed_out_backprop)); - if (out_depth > 1) { + if (dims.out_depth > 1) { functor::NHWCToNCHW<GPUDevice, T, 5>()( context->eigen_device<GPUDevice>(), out_backprop.tensor<T, 5>(), transformed_out_backprop.tensor<T, 5>()); @@ -1076,10 +1706,10 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { } Tensor transformed_input; if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = {batch, in_depth, compatible_input.dim_size(1), - compatible_input.dim_size(2), - compatible_input.dim_size(3)}; - if (in_depth > 1) { + TensorShape nchw_shape = { + dims.batch_size, dims.in_depth, compatible_input.dim_size(1), + compatible_input.dim_size(2), compatible_input.dim_size(3)}; + if (dims.in_depth > 1) { OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value, nchw_shape, &transformed_input)); @@ -1110,14 +1740,14 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { const int device_id = stream->parent()->device_ordinal(); DataType dtype = input.dtype(); const ConvParameters conv_parameters = { - batch, - in_depth, - {{input_size[0], input_size[1], input_size[2]}}, + dims.batch_size, + dims.in_depth, + {{dims.input_size(0), dims.input_size(1), dims.input_size(2)}}, FORMAT_NCHW, - out_depth, - {{filter_size[0], filter_size[1], filter_size[2]}}, - {{dilations[0], dilations[1], dilations[2]}}, - {{strides[0], strides[1], strides[2]}}, + dims.out_depth, + {{dims.filter_size(0), dims.filter_size(1), dims.filter_size(2)}}, + {{dims.dilation(0), dims.dilation(1), dims.dilation(2)}}, + {{dims.stride(0), dims.stride(1), dims.stride(2)}}, {{padding_planes, padding_rows, padding_cols}}, dtype, device_id, diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index ef692418d6..717a9f40a9 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -680,9 +680,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( TensorShape({filter.dim_size(3), filter.dim_size(2), filter.dim_size(0), filter.dim_size(1)}), &transformed_filter)); - functor::TransformFilter<GPUDevice, T, int, 4>()( - ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()), + ctx->eigen_device<GPUDevice>(), FORMAT_OIHW, + To32Bit(filter.tensor<T, 4>()), To32Bit(transformed_filter.tensor<T, 4>())); Tensor transformed_output; @@ -731,9 +731,15 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( if (cudnn_use_autotune && !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) { std::vector<AlgorithmDesc> algorithms; - CHECK(stream->parent()->GetConvolveAlgorithms( 
- conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()), - &algorithms)); + OP_REQUIRES( + ctx, + stream->parent()->GetConvolveAlgorithms( + conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>( + stream->parent()), + &algorithms), + errors::Unknown("Failed to get convolution algorithm. This is probably " + "because cuDNN failed to initialize, so try looking to " + "see if a warning log message was printed above.")); ProfileResult best_result; ProfileResult best_result_no_scratch; for (auto profile_algorithm : algorithms) { @@ -823,7 +829,8 @@ namespace functor { extern template struct MatMulConvFunctor<GPUDevice, T>; \ template <> \ void TransformFilter<GPUDevice, T, int, 4>::operator()( \ - const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \ + const GPUDevice& d, FilterTensorFormat dst_filter_format, \ + typename TTypes<T, 4, int>::ConstTensor in, \ typename TTypes<T, 4, int>::Tensor out); \ extern template struct TransformFilter<GPUDevice, T, int, 4>; \ template <> \ diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index a1eed4e68c..83df4dce38 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -386,7 +386,8 @@ struct LaunchConvOp<GPUDevice, T> { // filter: [x, y, z, in, out] // t_filter: [out, in, x, y, z] functor::TransformFilter<GPUDevice, T, int, 5>()( - ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 5>()), + ctx->eigen_device<GPUDevice>(), FORMAT_OIHW, + To32Bit(filter.tensor<T, 5>()), To32Bit(transformed_filter.tensor<T, 5>())); Tensor transformed_output; @@ -434,10 +435,16 @@ struct LaunchConvOp<GPUDevice, T> { if (cudnn_use_autotune && !AutoTuneConv3d::GetInstance()->Find( conv_parameters, &algorithm_config)) { std::vector<AlgorithmDesc> algorithms; - CHECK(stream->parent()->GetConvolveAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>( - stream->parent()), - &algorithms)); + OP_REQUIRES(ctx, + stream->parent()->GetConvolveAlgorithms( + conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>( + stream->parent()), + &algorithms), + errors::Unknown( + "Failed to get convolution algorithm. This is probably " + "because cuDNN failed to initialize, so try looking to " + "see if a warning log message was printed above.")); + ProfileResult best_result; ProfileResult best_result_no_scratch; for (auto profile_algorithm : algorithms) { @@ -514,7 +521,8 @@ namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void TransformFilter<GPUDevice, T, int, 5>::operator()( \ - const GPUDevice& d, typename TTypes<T, 5, int>::ConstTensor in, \ + const GPUDevice& d, FilterTensorFormat dst_filter_format, \ + typename TTypes<T, 5, int>::ConstTensor in, \ typename TTypes<T, 5, int>::Tensor out); \ template <> \ void ReverseTransformFilter<GPUDevice, T, 5>::operator()( \ diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index afc611f277..21d135decd 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -142,8 +142,12 @@ class ConvParameters { template <typename T> bool ShouldIncludeWinogradNonfusedAlgo( se::StreamExecutor* stream_exec) const { + auto* dnn_support = stream_exec->AsDnn(); + if (!dnn_support) { + return false; + } // Skip this check for cuDNN 7 and newer. 
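The conv_ops.cc and conv_ops_3d.cc hunks above replace a hard `CHECK` on `GetConvolveAlgorithms` with `OP_REQUIRES` and `errors::Unknown`, so a cuDNN initialization failure is reported as an op error instead of aborting the process. A minimal sketch of that pattern, using simplified stand-ins rather than TF's real `Status`/`OpKernelContext` types:

```cpp
#include <iostream>
#include <string>

// Toy stand-in for the CHECK -> OP_REQUIRES change above.
struct Status {
  bool ok = true;
  std::string message;
};

bool GetConvolveAlgorithms(bool cudnn_initialized) {
  return cudnn_initialized;  // pretend the query fails if cuDNN didn't init
}

Status Compute(bool cudnn_initialized) {
  if (!GetConvolveAlgorithms(cudnn_initialized)) {
    // Equivalent of OP_REQUIRES(ctx, ..., errors::Unknown(...)): record the
    // error and bail out of Compute() instead of CHECK-crashing.
    return {false,
            "Failed to get convolution algorithm. This is probably because "
            "cuDNN failed to initialize."};
  }
  return {true, ""};
}

int main() {
  const Status s = Compute(/*cudnn_initialized=*/false);
  if (!s.ok) std::cerr << s.message << "\n";  // surfaced to the user, no abort
}
```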
- auto version = stream_exec->AsDnn()->GetVersion(); + auto version = dnn_support->GetVersion(); if (version.ok() && version.ValueOrDie().major_version() >= 7) { return true; } diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc index a5fa48f85e..46167db3a2 100644 --- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc +++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc @@ -170,51 +170,33 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index<IndexCount> FlatToTensorIndex( return tensor_index; } -// A Cuda custom kernel that swaps dimension-0 and dimension-2 of a 3D tensor. -template <typename T, bool conjugate = false> -__global__ void SwapDimension0And2InTensor3Simple(int nthreads, const T* input, - Dimension<3> input_dims, - T* output) { - Dimension<3> output_dims; - output_dims[0] = input_dims[2]; - output_dims[1] = input_dims[1]; - output_dims[2] = input_dims[0]; - - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int output_index = index; - - Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); - - Index<3> input_tensor_index; - input_tensor_index[0] = output_tensor_index[2]; - input_tensor_index[1] = output_tensor_index[1]; - input_tensor_index[2] = output_tensor_index[0]; - - int input_index = TensorIndexToFlat(input_tensor_index, input_dims); - - output[output_index] = - maybe_conj<T, conjugate>::run(ldg(input + input_index)); - } -} - -// A Cuda custom kernel that swaps dimension-1 and dimension-2 of a 3D tensor. -template <typename T, bool conjugate = false> -__global__ void SwapDimension1And2InTensor3Simple(int nthreads, const T* input, - Dimension<3> input_dims, - T* output) { +// A simple CUDA custom kernel to shuffle dimensions of a 3D tensor according to +// the given shuffle permutation in template parameters. Shuffle permutation +// <sp0, sp1, sp2> shuffles dimensions such that input dimension 0 goes to sp0, +// 1 goes to sp1 and 2 goes to sp2. For example, shuffle permutation <2, 0, 1> +// will populate output so that input[x][y][z] is equal to (*output)[y][z][x]. +// +// Requires that nthreads is equal to the total number of elements in the input +// tensor. +template <typename T, int sp0, int sp1, int sp2, bool conjugate = false> +__global__ void ShuffleInTensor3Simple(int nthreads, const T* input, + Dimension<3> input_dims, T* output) { Dimension<3> output_dims; - output_dims[0] = input_dims[0]; - output_dims[1] = input_dims[2]; - output_dims[2] = input_dims[1]; - - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int output_index = index; + output_dims[sp0] = input_dims[0]; + output_dims[sp1] = input_dims[1]; + output_dims[sp2] = input_dims[2]; + + // Iterate over output as opposed to iterating over input for better + // performance. Iterating over output will generate sequential writes and + // random reads that performs better compared to sequential reads and random + // writes. 
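`ShuffleInTensor3Simple` generalizes the two deleted swap kernels: the permutation `<sp0, sp1, sp2>` sends input dimension d to output dimension sp_d, and each output element gathers from the permuted input index, so writes are sequential and reads are scattered, as the comment above explains. A host-side C++ sketch of the same index math (the real code is a CUDA `__global__` kernel parallelized over `output_index`):

```cpp
#include <array>
#include <cassert>
#include <vector>

// Shuffle permutation <sp0, sp1, sp2>: input dimension d goes to output
// dimension sp_d, so <2, 1, 0> swaps dims 0 and 2 and <0, 2, 1> swaps 1 and 2.
template <int sp0, int sp1, int sp2>
std::vector<float> Shuffle(const std::vector<float>& input,
                           const std::array<int, 3>& in_dims) {
  std::array<int, 3> out_dims;
  out_dims[sp0] = in_dims[0];
  out_dims[sp1] = in_dims[1];
  out_dims[sp2] = in_dims[2];

  std::vector<float> output(input.size());
  for (int o0 = 0; o0 < out_dims[0]; ++o0)
    for (int o1 = 0; o1 < out_dims[1]; ++o1)
      for (int o2 = 0; o2 < out_dims[2]; ++o2) {
        const std::array<int, 3> out_idx = {o0, o1, o2};
        // Gather: each output element reads the permuted input index
        // (sequential writes, scattered reads).
        const std::array<int, 3> in_idx = {out_idx[sp0], out_idx[sp1],
                                           out_idx[sp2]};
        const int in_flat =
            (in_idx[0] * in_dims[1] + in_idx[1]) * in_dims[2] + in_idx[2];
        const int out_flat = (o0 * out_dims[1] + o1) * out_dims[2] + o2;
        output[out_flat] = input[in_flat];
      }
  return output;
}

int main() {
  // 2 x 3 x 4 input with distinct values.
  const std::array<int, 3> dims = {2, 3, 4};
  std::vector<float> in(2 * 3 * 4);
  for (size_t i = 0; i < in.size(); ++i) in[i] = static_cast<float>(i);

  // <2, 0, 1>: input[x][y][z] ends up at output[y][z][x]; output is 3 x 4 x 2.
  const std::vector<float> out = Shuffle<2, 0, 1>(in, dims);
  // input[1][2][3] must equal output[2][3][1] in the 3 x 4 x 2 result.
  assert(out[(2 * 4 + 3) * 2 + 1] == in[(1 * 3 + 2) * 4 + 3]);
  return 0;
}
```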
+ CUDA_1D_KERNEL_LOOP(output_index, nthreads) { Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); Index<3> input_tensor_index; - input_tensor_index[0] = output_tensor_index[0]; - input_tensor_index[1] = output_tensor_index[2]; - input_tensor_index[2] = output_tensor_index[1]; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; int input_index = TensorIndexToFlat(input_tensor_index, input_dims); @@ -439,7 +421,7 @@ __global__ void PadInputCustomKernelNCHW(int nthreads, const T* input, template <typename T, int NDIMS> struct TransformFilter<GPUDevice, T, int, NDIMS> { typedef GPUDevice Device; - void operator()(const Device& d, + void operator()(const Device& d, FilterTensorFormat dst_filter_format, typename TTypes<T, NDIMS, int>::ConstTensor in, typename TTypes<T, NDIMS, int>::Tensor out) { Dimension<3> combined_dims; @@ -450,13 +432,18 @@ struct TransformFilter<GPUDevice, T, int, NDIMS> { combined_dims[1] = in.dimension(NDIMS - 2); // input filters combined_dims[2] = in.dimension(NDIMS - 1); // output filters CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d); - SwapDimension0And2InTensor3Simple<T> + + CHECK(dst_filter_format == FORMAT_OIHW) + << "Unsupported output layout: " << ToString(dst_filter_format); + + ShuffleInTensor3Simple<T, 2, 1, 0> <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( config.virtual_thread_count, in.data(), combined_dims, out.data()); } }; -// Converts Cudnn filter format back to TensorFlow filter format. +// Converts Cudnn filter format OIHW back to TensorFlow filter format HWIO. +// TODO(hinsu): Support reverse transformation from filter format OHWI as well. template <typename T, int NDIMS> struct ReverseTransformFilter<GPUDevice, T, NDIMS> { typedef GPUDevice Device; @@ -470,7 +457,7 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> { combined_dims[2] *= in.dimension(i); } CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d); - SwapDimension0And2InTensor3Simple<T> + ShuffleInTensor3Simple<T, 2, 1, 0> <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( config.virtual_thread_count, in.data(), combined_dims, out.data()); } @@ -937,7 +924,7 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input, } else { int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); - SwapDimension1And2InTensor3Simple<T, conjugate> + ShuffleInTensor3Simple<T, 0, 2, 1, conjugate> <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( config.virtual_thread_count, input, input_dims, output); } @@ -969,7 +956,7 @@ struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> { static_cast<int>(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; CudaLaunchConfig config = GetCudaLaunchConfig(total_size, d); - SwapDimension0And2InTensor3Simple<T, conjugate> + ShuffleInTensor3Simple<T, 2, 1, 0, conjugate> <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( config.virtual_thread_count, in, input_dims, out); } diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 3a1ac73f64..87efdff789 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -628,6 +628,20 @@ tf_kernel_library( ) tf_kernel_library( + name = "multi_device_iterator_ops", + srcs = 
["multi_device_iterator_ops.cc"], + deps = [ + ":dataset", + ":dataset_utils", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:ops_util", + ], +) + +tf_kernel_library( name = "optional_ops", srcs = ["optional_ops.cc"], hdrs = ["optional_ops.h"], @@ -675,6 +689,19 @@ tf_kernel_library( ) tf_kernel_library( + name = "model_dataset_op", + srcs = ["model_dataset_op.cc"], + deps = [ + ":dataset", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + +tf_kernel_library( name = "dataset_ops", srcs = ["dataset_ops.cc"], deps = [ @@ -708,6 +735,8 @@ tf_kernel_library( ":map_and_batch_dataset_op", ":map_dataset_op", ":map_defun_op", + ":model_dataset_op", + ":multi_device_iterator_ops", ":optimize_dataset_op", ":optional_ops", ":padded_batch_dataset_op", diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc index a25f78c6f1..d1db1d7bec 100644 --- a/tensorflow/core/kernels/data/batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/batch_dataset_op.cc @@ -117,6 +117,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel { : DatasetIterator<Dataset>(params) {} Status Initialize(IteratorContext* ctx) override { + AddConstantParameter(ctx, "batch_size", dataset()->batch_size_); return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); } diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc index 221b5ad835..34c6c86538 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc @@ -69,7 +69,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel { std::unique_ptr<IteratorBase> MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr<IteratorBase>( - new FileIterator({this, strings::StrCat(prefix, "::FileIterator")})); + new FileIterator({this, strings::StrCat(prefix, "::FileCache")})); } const DataTypeVector& output_dtypes() const override { @@ -553,7 +553,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel { std::unique_ptr<IteratorBase> MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr<IteratorBase>(new MemoryIterator( - {this, strings::StrCat(prefix, "::MemoryIterator")}, cache_)); + {this, strings::StrCat(prefix, "::MemoryCache")}, cache_)); } const DataTypeVector& output_dtypes() const override { diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index ad2365b25b..0bb929b3ce 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -17,43 +17,101 @@ limitations under the License. 
#include <utility> #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { namespace data { -/* static */ -Status CapturedFunction::Create( - const NameAttrList& func, std::vector<Tensor> captured_inputs, - std::unique_ptr<CapturedFunction>* out_function) { - return Create(func, std::move(captured_inputs), true, out_function); -} +namespace { + +// Simplistic implementation of the `StepStatsCollectorInterface` that only +// cares about collecting the CPU time needed to execute a captured function. +class SimpleStepStatsCollector : public StepStatsCollectorInterface { + public: + void IncrementProcessingTime(int64 delta) { + mutex_lock l(mu_); + processing_time_ += delta; + } + + NodeExecStatsInterface* CreateNodeExecStats(const Node* node) override { + return new SimpleNodeExecStats(this); + } + + string ReportAllocsOnResourceExhausted(const string& err) override { + return ""; + } + + int64 processing_time() { + tf_shared_lock l(mu_); + return processing_time_; + } + + private: + class SimpleNodeExecStats : public NodeExecStatsInterface { + public: + explicit SimpleNodeExecStats(SimpleStepStatsCollector* step_stats_collector) + : step_stats_collector_(step_stats_collector) {} + + void Done(const string& device) override { + step_stats_collector_->IncrementProcessingTime(end_time_ns_ - + start_time_ns_); + delete this; + } + + void RecordExecutorStarted() override { + start_time_ns_ = Env::Default()->NowNanos(); + } + + void RecordComputeStarted() override {} + + void RecordComputeEnded() override {} + + void RecordExecutorEnded() override { + end_time_ns_ = Env::Default()->NowNanos(); + } + + void SetMemory(OpKernelContext* ctx) override {} + + void SetOutput(int slot, const Tensor* tensor) override {} + + void SetReferencedTensors(const TensorReferenceVector& tensors) override {} + + void SetScheduled(int64 nanos) override {} + + private: + int64 start_time_ns_ = 0; + int64 end_time_ns_ = 0; + SimpleStepStatsCollector* step_stats_collector_; // Not owned. 
+ }; + + mutex mu_; + int64 processing_time_ GUARDED_BY(mu_) = 0; +}; + +} // namespace /* static */ Status CapturedFunction::Create( - const NameAttrList& func, std::vector<Tensor> captured_inputs, - bool use_inter_op_parallelism, + const NameAttrList& func, OpKernelContext* ctx, const string& argument, std::unique_ptr<CapturedFunction>* out_function) { - out_function->reset(new CapturedFunction(func, std::move(captured_inputs), - use_inter_op_parallelism)); - return Status::OK(); + return CapturedFunction::Create(func, ctx, argument, true, out_function); } -/* static */ Status CapturedFunction::Create( const NameAttrList& func, OpKernelContext* ctx, const string& argument, + bool use_inter_op_parallelism, std::unique_ptr<CapturedFunction>* out_function) { - OpInputList argument_inputs; - TF_RETURN_IF_ERROR(ctx->input_list(argument, &argument_inputs)); - std::vector<Tensor> arguments_t; - arguments_t.reserve(argument_inputs.size()); - for (const Tensor& t : argument_inputs) { - arguments_t.push_back(t); - } - return CapturedFunction::Create(func, std::move(arguments_t), out_function); + OpInputList inputs; + TF_RETURN_IF_ERROR(ctx->input_list(argument, &inputs)); + std::vector<Tensor> arguments(inputs.begin(), inputs.end()); + *out_function = WrapUnique(new CapturedFunction(func, std::move(arguments), + use_inter_op_parallelism)); + return Status::OK(); } CapturedFunction::~CapturedFunction() { @@ -358,7 +416,8 @@ Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args, void CapturedFunction::RunAsync(IteratorContext* ctx, std::vector<Tensor>&& args, std::vector<Tensor>* rets, - FunctionLibraryRuntime::DoneCallback done) { + FunctionLibraryRuntime::DoneCallback done, + const string& prefix) { // NOTE(mrry): This method does not transfer ownership of `ctx`, and it may // be deleted before `done` is called. Take care not to capture `ctx` in any // code that may execute asynchronously in this function. @@ -368,13 +427,13 @@ void CapturedFunction::RunAsync(IteratorContext* ctx, done(s); return; } - auto frame = + OwnedArgsCallFrame* frame = new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_); FunctionLibraryRuntime::Options f_opts; f_opts.step_id = CapturedFunction::generate_step_id(); ResourceMgr* resource_mgr = ctx->lib()->device()->resource_manager(); - auto step_container = new ScopedStepContainer( + ScopedStepContainer* step_container = new ScopedStepContainer( f_opts.step_id, [resource_mgr](const string& name) { resource_mgr->Cleanup(name).IgnoreError(); }); @@ -389,25 +448,40 @@ void CapturedFunction::RunAsync(IteratorContext* ctx, // (such as queue kernels) that depend on the non-nullness of // `OpKernelContext::cancellation_manager()`, but additional effort // will be required to plumb it through the `IteratorContext`. - auto c_mgr = new CancellationManager; + CancellationManager* c_mgr = new CancellationManager; f_opts.cancellation_manager = c_mgr; - - tf_shared_lock l(mu_); - ctx->lib()->Run(f_opts, handle, frame, - std::bind( - [rets, step_container, c_mgr, frame]( - FunctionLibraryRuntime::DoneCallback done, - // Begin unbound arguments. 
- Status s) { - delete step_container; - delete c_mgr; - if (s.ok()) { - s = frame->ConsumeRetvals(rets); - } - delete frame; - done(s); - }, - std::move(done), std::placeholders::_1)); + std::shared_ptr<SimpleStepStatsCollector> stats_collector; + if (ctx->model()) { + stats_collector = MakeUnique<SimpleStepStatsCollector>(); + } + f_opts.stats_collector = stats_collector.get(); + + auto callback = std::bind( + [rets, step_container, c_mgr, frame]( + const FunctionLibraryRuntime::DoneCallback& done, + const std::shared_ptr<model::Model>& model, const string& prefix, + const std::shared_ptr<SimpleStepStatsCollector>& stats_collector, + // Begin unbound arguments. + Status s) { + delete step_container; + delete c_mgr; + if (s.ok()) { + s = frame->ConsumeRetvals(rets); + } + delete frame; + if (model) { + model->AddProcessingTime(prefix, stats_collector->processing_time()); + model->RecordStart(prefix, false /* stop_output */); + } + done(s); + if (model) { + model->RecordStop(prefix, false /* start_output */); + } + }, + std::move(done), ctx->model(), prefix, std::move(stats_collector), + std::placeholders::_1); + + ctx->lib()->Run(f_opts, handle, frame, std::move(callback)); } CapturedFunction::CapturedFunction(const NameAttrList& func, diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h index e44bc78b1c..a10376bf97 100644 --- a/tensorflow/core/kernels/data/captured_function.h +++ b/tensorflow/core/kernels/data/captured_function.h @@ -42,27 +42,19 @@ namespace data { // context. class CapturedFunction { public: - // Creates a new instance from a list of named attributes and captured inputs. - // - // NOTE(mrry): The `captured_inputs` are passed by value. For - // efficiency, you are recommended to move this argument into the call. - static Status Create(const NameAttrList& func, - std::vector<Tensor> captured_inputs, + // Creates a new instance using a list of named attributes, fetching captured + // inputs from a context argument. + static Status Create(const NameAttrList& func, OpKernelContext* ctx, + const string& argument, std::unique_ptr<CapturedFunction>* out_function); - // Creates a new instance from a list of named attributes and captured inputs. + // Creates a new instance using a list of named attributes, fetching captured + // inputs from a context argument. // // If `use_inter_op_parallelism` is false, the runtime may use an executor // that is optimized for small functions. - static Status Create(const NameAttrList& func, - std::vector<Tensor> captured_inputs, - bool use_inter_op_parallelism, - std::unique_ptr<CapturedFunction>* out_function); - - // Creates a new instance using a list of named attributes, fetching captured - // inputs from a context argument. static Status Create(const NameAttrList& func, OpKernelContext* ctx, - const string& argument, + const string& argument, bool use_inter_op_parallelism, std::unique_ptr<CapturedFunction>* out_function); ~CapturedFunction(); @@ -104,7 +96,8 @@ class CapturedFunction { // in order to be able to deallocate them as early as possible. void RunAsync(IteratorContext* ctx, std::vector<Tensor>&& args, std::vector<Tensor>* rets, - FunctionLibraryRuntime::DoneCallback done); + FunctionLibraryRuntime::DoneCallback done, + const string& prefix); // Returns the named list of function arguments. 
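The RunAsync changes above thread the iterator prefix and a step-stats collector through the completion callback so the time spent running the captured function can be credited to the calling iterator node. A minimal standalone sketch of that bookkeeping, in plain C++ (ProcessingTimeLedger and RunAndRecord are stand-in names, not TensorFlow APIs):

#include <chrono>
#include <cstdint>
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>

// Accumulates processing time per iterator prefix, as the model does.
class ProcessingTimeLedger {
 public:
  void Add(const std::string& prefix, int64_t delta_ns) {
    std::lock_guard<std::mutex> l(mu_);
    time_ns_[prefix] += delta_ns;
  }
  int64_t Get(const std::string& prefix) {
    std::lock_guard<std::mutex> l(mu_);
    return time_ns_[prefix];
  }
 private:
  std::mutex mu_;
  std::unordered_map<std::string, int64_t> time_ns_;
};

// Times a user-supplied function and credits the elapsed time to `prefix`.
void RunAndRecord(ProcessingTimeLedger* ledger, const std::string& prefix,
                  const std::function<void()>& fn) {
  auto start = std::chrono::steady_clock::now();
  fn();
  auto end = std::chrono::steady_clock::now();
  ledger->Add(prefix,
              std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
                  .count());
}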
const NameAttrList& func() { return func_; } diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc index e7ac368ae3..e10833f525 100644 --- a/tensorflow/core/kernels/data/dataset_utils.cc +++ b/tensorflow/core/kernels/data/dataset_utils.cc @@ -44,5 +44,42 @@ Status MakeIteratorFromInputElement( ctx, strings::StrCat(prefix, "[", thread_index, "]"), out_iterator); } +Status VerifyTypesMatch(const DataTypeVector& expected, + const DataTypeVector& received) { + if (expected.size() != received.size()) { + return errors::InvalidArgument( + "Number of components does not match: expected ", expected.size(), + " types but got ", received.size(), "."); + } + for (size_t i = 0; i < expected.size(); ++i) { + if (expected[i] != received[i]) { + return errors::InvalidArgument("Data type mismatch at component ", i, + ": expected ", DataTypeString(expected[i]), + " but got ", DataTypeString(received[i]), + "."); + } + } + return Status::OK(); +} + +Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected, + const std::vector<PartialTensorShape>& received) { + if (expected.size() != received.size()) { + return errors::InvalidArgument( + "Number of components does not match: expected ", expected.size(), + " shapes but got ", received.size(), "."); + } + for (size_t i = 0; i < expected.size(); ++i) { + if (!expected[i].IsCompatibleWith(received[i])) { + return errors::InvalidArgument("Incompatible shapes at component ", i, + ": expected ", expected[i].DebugString(), + " but got ", received[i].DebugString(), + "."); + } + } + + return Status::OK(); +} + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h index 234856ea39..6ec1350cd4 100644 --- a/tensorflow/core/kernels/data/dataset_utils.h +++ b/tensorflow/core/kernels/data/dataset_utils.h @@ -27,6 +27,16 @@ Status MakeIteratorFromInputElement( int64 thread_index, CapturedFunction* captured_func, StringPiece prefix, std::unique_ptr<IteratorBase>* out_iterator); +// Returns Status::OK() if `expected` and `received` types match, +// errors::InvalidArgument otherwise. +Status VerifyTypesMatch(const DataTypeVector& expected, + const DataTypeVector& received); + +// Returns Status::OK() if `expected` and `received` shapes are compatible, +// errors::InvalidArgument otherwise. +Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected, + const std::vector<PartialTensorShape>& received); + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc index bf0aecaf3c..00884314a9 100644 --- a/tensorflow/core/kernels/data/filter_dataset_op.cc +++ b/tensorflow/core/kernels/data/filter_dataset_op.cc @@ -14,11 +14,13 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/stats_aggregator.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/data/captured_function.h" #include "tensorflow/core/kernels/data/dataset.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { namespace data { @@ -37,14 +39,6 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } - FunctionLibraryRuntime::Handle pred_handle; OP_REQUIRES_OK(ctx, ctx->function_library()->Instantiate( @@ -61,9 +55,10 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { Node* ret_node = pred_body->ret_nodes[0]; Node* ret_input_node; OP_REQUIRES_OK(ctx, ret_node->input_node(0, &ret_input_node)); + std::unique_ptr<CapturedFunction> captured_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - func_, std::move(other_arguments), &captured_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments", + &captured_func)); if (ret_input_node->def().op() == "_Arg") { int32 index = -1; @@ -146,7 +141,13 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { class Iterator : public DatasetIterator<FilterDatasetBase> { public: explicit Iterator(const Params& params) - : DatasetIterator<FilterDatasetBase>(params) {} + : DatasetIterator<FilterDatasetBase>(params), + filtered_elements_(0), + dropped_elements_(0) { + std::vector<string> components = + str_util::Split(params.prefix, "::", str_util::SkipEmpty()); + prefix_end_ = components.back(); + } Status Initialize(IteratorContext* ctx) override { TF_RETURN_IF_ERROR( @@ -161,6 +162,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { // `input_impl_` and `f` are thread-safe. However, if multiple // threads enter this method, outputs may be observed in a // non-deterministic order. + auto stats_aggregator = ctx->stats_aggregator(); bool matched; do { { @@ -183,8 +185,34 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { if (!matched) { // Clear the output tensor list since it didn't match. out_tensors->clear(); + if (stats_aggregator) { + mutex_lock l(mu_); + dropped_elements_++; + stats_aggregator->AddScalar( + strings::StrCat(prefix_end_, "::dropped_elements"), + static_cast<float>((dropped_elements_))); + // TODO(shivaniagrawal): multiple pipelines would collect + // aggregated number of dropped elements for all the pipelines, + // exploit tagged_context here. + stats_aggregator->IncrementCounter( + prefix_end_, "dropped_elements", static_cast<float>(1)); + } } } while (!matched); + // TODO(shivaniagrawal): add ratio of dropped_elements and + // filtered_elements as a histogram. 
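The iterator above derives prefix_end_ by splitting its prefix on "::" and keeping the last component, which is then used to tag the dropped_elements and filtered_elements statistics. A one-function sketch of that derivation (PrefixEnd is a hypothetical helper; the example prefix is illustrative only):

#include <string>

// Returns the text after the final "::", e.g. "Iterator::Batch::Filter"
// yields "Filter"; if there is no "::", the whole prefix is returned.
std::string PrefixEnd(const std::string& prefix) {
  const std::string::size_type pos = prefix.rfind("::");
  return pos == std::string::npos ? prefix : prefix.substr(pos + 2);
}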
+ if (stats_aggregator) { + mutex_lock l(mu_); + filtered_elements_++; + stats_aggregator->AddScalar( + strings::StrCat(prefix_end_, "::filtered_elements"), + static_cast<float>((filtered_elements_))); + // TODO(shivaniagrawal): multiple pipelines would collect aggregated + // number of filtered elements for all the pipelines, exploit + // tagged_context here. + stats_aggregator->IncrementCounter(prefix_end_, "filtered_elements", + static_cast<float>(1)); + } *end_of_sequence = false; return Status::OK(); } @@ -197,6 +225,10 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { else TF_RETURN_IF_ERROR( writer->WriteScalar(full_name("input_impls_empty"), "")); + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("filtered_elements"), + filtered_elements_)); + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("dropped_elements"), + dropped_elements_)); return Status::OK(); } @@ -207,12 +239,19 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { input_impl_.reset(); else TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("filtered_elements"), + &filtered_elements_)); + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("dropped_elements"), + &dropped_elements_)); return Status::OK(); } private: mutex mu_; std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_); + int64 filtered_elements_ GUARDED_BY(mu_); + int64 dropped_elements_ GUARDED_BY(mu_); + string prefix_end_; }; const DatasetBase* const input_; diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc index e3c45ef86c..2fada22a21 100644 --- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc @@ -39,18 +39,9 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } - std::unique_ptr<CapturedFunction> captured_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - func_, std::move(other_arguments), &captured_func)); - + OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments", + &captured_func)); *output = new Dataset(ctx, input, func_, std::move(captured_func), output_types_, output_shapes_); } diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc index ac5cc1b2c1..71a36314a0 100644 --- a/tensorflow/core/kernels/data/generator_dataset_op.cc +++ b/tensorflow/core/kernels/data/generator_dataset_op.cc @@ -145,44 +145,18 @@ GeneratorDatasetOp::GeneratorDatasetOp(OpKernelConstruction* ctx) void GeneratorDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase** output) { - OpInputList init_func_other_args_input; - OP_REQUIRES_OK(ctx, ctx->input_list("init_func_other_args", - &init_func_other_args_input)); - std::vector<Tensor> init_func_other_args; - init_func_other_args.reserve(init_func_other_args_input.size()); - for (const Tensor& t : init_func_other_args_input) { - init_func_other_args.push_back(t); - } std::unique_ptr<CapturedFunction> init_func; - OP_REQUIRES_OK( - ctx, CapturedFunction::Create(init_func_, std::move(init_func_other_args), - &init_func)); - - OpInputList next_func_other_args_input; - OP_REQUIRES_OK(ctx, 
ctx->input_list("next_func_other_args", - &next_func_other_args_input)); - std::vector<Tensor> next_func_other_args; - next_func_other_args.reserve(next_func_other_args_input.size()); - for (const Tensor& t : next_func_other_args_input) { - next_func_other_args.push_back(t); - } + OP_REQUIRES_OK(ctx, CapturedFunction::Create( + init_func_, ctx, "init_func_other_args", &init_func)); + std::unique_ptr<CapturedFunction> next_func; - OP_REQUIRES_OK( - ctx, CapturedFunction::Create(next_func_, std::move(next_func_other_args), - &next_func)); - - OpInputList finalize_func_other_args_input; - OP_REQUIRES_OK(ctx, ctx->input_list("finalize_func_other_args", - &finalize_func_other_args_input)); - std::vector<Tensor> finalize_func_other_args; - finalize_func_other_args.reserve(finalize_func_other_args_input.size()); - for (const Tensor& t : finalize_func_other_args_input) { - finalize_func_other_args.push_back(t); - } - std::unique_ptr<CapturedFunction> finalize_func; OP_REQUIRES_OK(ctx, CapturedFunction::Create( - finalize_func_, std::move(finalize_func_other_args), - &finalize_func)); + next_func_, ctx, "next_func_other_args", &next_func)); + + std::unique_ptr<CapturedFunction> finalize_func; + OP_REQUIRES_OK(ctx, CapturedFunction::Create(finalize_func_, ctx, + "finalize_func_other_args", + &finalize_func)); *output = new Dataset(ctx, std::move(init_func), std::move(next_func), diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc index e4fa557598..8b417bb1c2 100644 --- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc @@ -42,50 +42,19 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - // Get captured inputs for the key, reduce, and window_size functions. - OpInputList key_func_other_argument_inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("key_func_other_arguments", - &key_func_other_argument_inputs)); - std::vector<Tensor> key_func_other_arguments; - key_func_other_arguments.reserve(key_func_other_argument_inputs.size()); - for (const Tensor& t : key_func_other_argument_inputs) { - key_func_other_arguments.push_back(t); - } - OpInputList reduce_func_other_argument_inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("reduce_func_other_arguments", - &reduce_func_other_argument_inputs)); - std::vector<Tensor> reduce_func_other_arguments; - reduce_func_other_arguments.reserve( - reduce_func_other_argument_inputs.size()); - for (const Tensor& t : reduce_func_other_argument_inputs) { - reduce_func_other_arguments.push_back(t); - } - OpInputList window_size_func_other_argument_inputs; - OP_REQUIRES_OK(ctx, - ctx->input_list("window_size_func_other_arguments", - &window_size_func_other_argument_inputs)); - std::vector<Tensor> window_size_func_other_arguments; - window_size_func_other_arguments.reserve( - window_size_func_other_argument_inputs.size()); - for (const Tensor& t : window_size_func_other_argument_inputs) { - window_size_func_other_arguments.push_back(t); - } - // TODO(mrry): Refactor CapturedFunction to share the runtime - // state between multiple functions? 
std::unique_ptr<CapturedFunction> captured_key_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - key_func_, std::move(key_func_other_arguments), - &captured_key_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(key_func_, ctx, + "key_func_other_arguments", + &captured_key_func)); std::unique_ptr<CapturedFunction> captured_reduce_func; - OP_REQUIRES_OK( - ctx, CapturedFunction::Create(reduce_func_, - std::move(reduce_func_other_arguments), - &captured_reduce_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(reduce_func_, ctx, + "reduce_func_other_arguments", + &captured_reduce_func)); std::unique_ptr<CapturedFunction> captured_window_size_func; - OP_REQUIRES_OK( - ctx, CapturedFunction::Create( - window_size_func_, std::move(window_size_func_other_arguments), - &captured_window_size_func)); + OP_REQUIRES_OK(ctx, + CapturedFunction::Create(window_size_func_, ctx, + "window_size_func_other_arguments", + &captured_window_size_func)); *output = new Dataset( ctx, input, key_func_, reduce_func_, window_size_func_, diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc index 0768f46665..0aa802b874 100644 --- a/tensorflow/core/kernels/data/interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc @@ -39,14 +39,6 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } - const Tensor* cycle_length_t; OP_REQUIRES_OK(ctx, ctx->input("cycle_length", &cycle_length_t)); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(cycle_length_t->shape()), @@ -66,8 +58,8 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel { errors::InvalidArgument("block_length must be greater than zero.")); std::unique_ptr<CapturedFunction> captured_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - func_, std::move(other_arguments), &captured_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments", + &captured_func)); *output = new Dataset(ctx, input, func_, std::move(captured_func), cycle_length, diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index fe6d705eab..c0bc507ec0 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -44,43 +44,6 @@ namespace { const char kIteratorVariantTypeName[] = "tensorflow::Iterator"; -Status VerifyTypesMatch(const DataTypeVector& expected, - const DataTypeVector& received) { - if (expected.size() != received.size()) { - return errors::InvalidArgument( - "Number of components does not match: expected ", expected.size(), - " types but got ", received.size(), "."); - } - for (size_t i = 0; i < expected.size(); ++i) { - if (expected[i] != received[i]) { - return errors::InvalidArgument("Data type mismatch at component ", i, - ": expected ", DataTypeString(expected[i]), - " but got ", DataTypeString(received[i]), - "."); - } - } - return Status::OK(); -} - -Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected, - const std::vector<PartialTensorShape>& received) { - if (expected.size() != received.size()) { - return errors::InvalidArgument( - "Number of components does not match: expected ", 
expected.size(), - " shapes but got ", received.size(), "."); - } - for (size_t i = 0; i < expected.size(); ++i) { - if (!expected[i].IsCompatibleWith(received[i])) { - return errors::InvalidArgument("Incompatible shapes at component ", i, - ": expected ", expected[i].DebugString(), - " but got ", received[i].DebugString(), - "."); - } - } - - return Status::OK(); -} - } // namespace class IteratorResource : public ResourceBase { @@ -403,12 +366,12 @@ class IteratorStateVariant { } string TypeName() const { return kIteratorVariantTypeName; } void Encode(VariantTensorData* data) const { *data = *data_; } - bool Decode(const VariantTensorData& data) { + bool Decode(VariantTensorData data) { if (data.type_name() != TypeName()) { return false; } std::unique_ptr<VariantTensorData> tensor_data(new VariantTensorData); - *tensor_data = data; + std::swap(*tensor_data, data); std::unique_ptr<VariantTensorDataReader> reader( new VariantTensorDataReader(tensor_data.get())); status_ = reader->status(); diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index 27c89b3661..2bbf4af664 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #define EIGEN_USE_THREADS +#include <atomic> #include <utility> #include "tensorflow/core/common_runtime/function.h" @@ -26,6 +27,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/tracing.h" namespace tensorflow { @@ -39,7 +41,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { public: explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx) : UnaryDatasetOpKernel(ctx), - graph_def_version_(ctx->graph_def_version()), op_version_(ctx->def().op() == "MapAndBatchDataset" ? 
1 : 2) { OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); @@ -49,14 +50,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { protected: void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } - int64 batch_size; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "batch_size", &batch_size)); OP_REQUIRES( @@ -77,7 +70,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { case 2: OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls", &num_parallel_calls)); - OP_REQUIRES(ctx, num_parallel_calls > 0, + OP_REQUIRES(ctx, + num_parallel_calls > 0 || num_parallel_calls == kAutoTune, errors::InvalidArgument( "num_parallel_calls must be greater than zero.")); break; @@ -92,8 +86,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { ParseScalarArgument(ctx, "drop_remainder", &drop_remainder)); std::unique_ptr<CapturedFunction> captured_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - func_, std::move(other_arguments), &captured_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments", + &captured_func)); *output = new Dataset(ctx, input, batch_size, num_parallel_calls, drop_remainder, output_types_, output_shapes_, func_, @@ -190,7 +184,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { class Iterator : public DatasetIterator<Dataset> { public: explicit Iterator(const Params& params) - : DatasetIterator<Dataset>(params) {} + : DatasetIterator<Dataset>(params), + num_parallel_calls_(params.dataset->num_parallel_calls_) {} ~Iterator() override { mutex_lock l(mu_); @@ -204,6 +199,16 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { } Status Initialize(IteratorContext* ctx) override { + mutex_lock l(mu_); + AddConstantParameter(ctx, "batch_size", dataset()->batch_size_); + if (num_parallel_calls_ == kAutoTune) { + num_parallel_calls_ = 1; + AddTunableParameter(ctx, "parallelism", + &num_parallel_calls_ /* value */, 1 /* min */, + port::NumSchedulableCPUs() /* max */, &cond_var_); + } else { + AddConstantParameter(ctx, "parallelism", num_parallel_calls_); + } TF_RETURN_IF_ERROR( dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_)); return dataset()->captured_func_->Instantiate(ctx); @@ -218,12 +223,14 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { EnsureRunnerThreadStarted(ctx); while (batch_results_.empty() || batch_results_.front()->num_calls > 0) { + RecordStop(ctx); cond_var_.wait(l); + RecordStart(ctx); } std::swap(result, batch_results_.front()); batch_results_.pop_front(); + cond_var_.notify_all(); } - cond_var_.notify_all(); return ProcessResult(ctx, result, out_tensors, end_of_sequence); } @@ -326,11 +333,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { void CallCompleted(const std::shared_ptr<BatchResult>& result) LOCKS_EXCLUDED(mu_) { - { - mutex_lock l(mu_); - num_calls_--; - result->num_calls--; - } + mutex_lock l(mu_); + num_calls_--; + result->num_calls--; cond_var_.notify_all(); } @@ -365,7 +370,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { ctx.get(), std::move(input_element), return_values.get(), [this, ctx, result, return_values, offset](Status status) { Callback(ctx, result, return_values, offset, 
status); - }); + }, + prefix()); }, ctx, std::move(input_element))); } @@ -422,11 +428,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { result->output_allocated = true; } - int MaxBatchResults() EXCLUSIVE_LOCKS_REQUIRED(mu_) { - return (dataset()->num_parallel_calls_ + dataset()->batch_size_ - 1) / - dataset()->batch_size_; - } - Status ProcessResult(IteratorContext* ctx, const std::shared_ptr<BatchResult>& result, std::vector<Tensor>* out_tensors, @@ -475,26 +476,34 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) LOCKS_EXCLUDED(mu_) { std::vector<std::pair<std::shared_ptr<BatchResult>, int64>> new_calls; - new_calls.reserve(dataset()->num_parallel_calls_); + RecordStart(ctx.get()); + auto stop_cleanup = + gtl::MakeCleanup([this, &ctx]() { RecordStop(ctx.get()); }); + new_calls.reserve(num_parallel_calls_); + auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(mu_) -> bool { + int64 num_parallel_calls = num_parallel_calls_; + int64 max_batch_results = + (num_parallel_calls + dataset()->batch_size_ - 1) / + dataset()->batch_size_; + return num_calls_ >= num_parallel_calls || + (batch_results_.size() > max_batch_results || + (batch_results_.size() == max_batch_results && + call_counter_ % dataset()->batch_size_ == 0)); + }; while (true) { { mutex_lock l(mu_); - while (!cancelled_ && - (num_calls_ >= dataset()->num_parallel_calls_ || - batch_results_.size() > MaxBatchResults() || - (batch_results_.size() == MaxBatchResults() && - call_counter_ % dataset()->batch_size_ == 0))) { + while (!cancelled_ && busy()) { + RecordStop(ctx.get()); cond_var_.wait(l); + RecordStart(ctx.get()); } if (cancelled_) { return; } - while (num_calls_ < dataset()->num_parallel_calls_ && - (batch_results_.size() < MaxBatchResults() || - (batch_results_.size() == MaxBatchResults() && - call_counter_ % dataset()->batch_size_ != 0))) { + while (!busy()) { if (call_counter_ % dataset()->batch_size_ == 0) { batch_results_.emplace_back( new BatchResult(dataset()->batch_size_)); @@ -638,6 +647,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { // user specified level of parallelism and there are slots available in // the `batch_results_` buffer. condition_variable cond_var_; + // Identifies the maximum number of parallel calls. + std::atomic<int64> num_parallel_calls_; // Counts the number of outstanding calls for this batch. int64 num_calls_ GUARDED_BY(mu_) = 0; // Counts the total number of calls. 
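The busy() predicate above bounds both the number of outstanding calls and the number of in-flight batch results; max_batch_results is the ceiling of num_parallel_calls divided by batch_size, recomputed on every check because num_parallel_calls_ is now a tunable atomic. A small standalone illustration of that arithmetic (hypothetical helper name):

#include <cstdint>

// Ceiling division: the number of batches needed to absorb all
// concurrently running calls.
int64_t MaxBatchResults(int64_t num_parallel_calls, int64_t batch_size) {
  return (num_parallel_calls + batch_size - 1) / batch_size;
}

// Example: with num_parallel_calls = 10 and batch_size = 4, at most
// MaxBatchResults(10, 4) == 3 batch results are kept in flight.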
@@ -661,7 +672,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { const Eigen::ThreadPoolDevice* device_; // not owned }; - const int graph_def_version_; const int op_version_; DataTypeVector output_types_; std::vector<PartialTensorShape> output_shapes_; diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc index af301e2b42..f112e1dc43 100644 --- a/tensorflow/core/kernels/data/map_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_dataset_op.cc @@ -38,18 +38,10 @@ class MapDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } - std::unique_ptr<CapturedFunction> captured_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - func_, std::move(other_arguments), - use_inter_op_parallelism_, &captured_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments", + use_inter_op_parallelism_, + &captured_func)); *output = new Dataset(ctx, input, func_, std::move(captured_func), output_types_, output_shapes_); diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc index b87d61ee44..6657f2b2b3 100644 --- a/tensorflow/core/kernels/data/map_defun_op.cc +++ b/tensorflow/core/kernels/data/map_defun_op.cc @@ -81,119 +81,167 @@ class MapDefunOp : public AsyncOpKernel { } void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { - int64 batch_size; - OP_REQUIRES_OK_ASYNC(ctx, GetInputBatchSize(ctx, &batch_size), done); + ComputeOptions* compute_opts = nullptr; - // Inputs - auto* args = new std::vector<Tensor>; - auto* arg_shapes = new std::vector<TensorShape>; + OP_REQUIRES_OK_ASYNC(ctx, SetupArgs(ctx, &compute_opts), done); - // Create a copy because every `Compute` may have different output shapes. 
- auto* output_shapes = new std::vector<PartialTensorShape>(output_shapes_); - arg_shapes->reserve(ctx->num_inputs()); - args->reserve(ctx->num_inputs()); + Status s = SetupOutputs(ctx, compute_opts); + if (!s.ok()) delete compute_opts; + OP_REQUIRES_OK_ASYNC(ctx, s, done); - auto* mu = new mutex; - - for (size_t i = 0; i < ctx->num_inputs(); ++i) { - args->push_back(ctx->input(i)); - arg_shapes->push_back(ctx->input(i).shape()); - arg_shapes->at(i).RemoveDim(0); // Remove the first batch dimension - } - - // Outputs - auto* output = new OpOutputList; - OP_REQUIRES_OK_ASYNC(ctx, ctx->output_list("output", output), done); - - for (size_t i = 0; i < output_types().size(); ++i) { - if (output_shapes_.at(i).IsFullyDefined()) { - Tensor* out = nullptr; - TensorShape output_shape; - output_shapes_.at(i).AsTensorShape(&output_shape); - output_shape.InsertDim(0, batch_size); - OP_REQUIRES_OK_ASYNC(ctx, output->allocate(i, output_shape, &out), - done); - } - } - - SetRunOptions(ctx, &opts_, false); + FunctionLibraryRuntime::Options opts; + SetRunOptions(ctx, &opts, false); // Run loop StatusCallback callback = std::bind( - [](OpKernelContext* ctx, std::vector<Tensor>* args, - std::vector<TensorShape>* arg_shapes, - std::vector<PartialTensorShape>* output_shapes, OpOutputList* output, - mutex* mu, DoneCallback& done, const Status& status) { - delete args; - delete arg_shapes; - delete output; - delete output_shapes; - delete mu; + [](OpKernelContext* ctx, ComputeOptions* compute_opts, + DoneCallback& done, const Status& status) { + delete compute_opts; ctx->SetStatus(status); done(); }, - ctx, args, arg_shapes, output_shapes, output, mu, std::move(done), - std::placeholders::_1); + ctx, compute_opts, std::move(done), std::placeholders::_1); auto* refcounted = new ReffedStatusCallback(std::move(callback)); - for (size_t i = 1; i < static_cast<size_t>(batch_size); ++i) { - // Start from i = 1 because refcounted is initialized with refcount = 1 - refcounted->Ref(); - } + CancellationManager* parent_mgr = ctx->cancellation_manager(); - for (size_t i = 0; i < static_cast<size_t>(batch_size); ++i) { - auto* call_frame = new MapFunctionCallFrame( - *args, *arg_shapes, output_shapes, mu, output, this, i, - static_cast<size_t>(batch_size)); + for (size_t i = 0; i < static_cast<size_t>(compute_opts->batch_size); ++i) { + // We use a different cancellation manager each time the function is run + // to avoid the race condition between a function run error and other + // functions being cancelled as a result. 
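The loop below gives every function invocation its own CancellationManager and registers a callback on the parent manager, so cancelling the op fans out to all in-flight calls while a failure in one call does not cancel its siblings. A compact standalone sketch of that fan-out pattern in plain C++ (the class and function names here are stand-ins, not the TensorFlow types):

#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>

// Simplified stand-in for a cancellation manager.
class SimpleCancellation {
 public:
  using Token = int;
  // Registers a callback; returns false if already cancelled.
  bool Register(Token token, std::function<void()> cb) {
    std::lock_guard<std::mutex> l(mu_);
    if (cancelled_) return false;
    callbacks_[token] = std::move(cb);
    return true;
  }
  void Deregister(Token token) {
    std::lock_guard<std::mutex> l(mu_);
    callbacks_.erase(token);
  }
  void StartCancel() {
    std::unordered_map<Token, std::function<void()>> cbs;
    {
      std::lock_guard<std::mutex> l(mu_);
      cancelled_ = true;
      cbs.swap(callbacks_);
    }
    for (auto& kv : cbs) kv.second();  // Fan out to registered children.
  }
 private:
  std::mutex mu_;
  bool cancelled_ = false;
  std::unordered_map<Token, std::function<void()>> callbacks_;
};

// Per-call wiring: each call gets a child manager hooked to the parent.
void LaunchCall(SimpleCancellation* parent, SimpleCancellation::Token token,
                const std::function<void(SimpleCancellation*)>& run_call) {
  auto child = std::make_shared<SimpleCancellation>();
  if (!parent->Register(token, [child] { child->StartCancel(); })) {
    return;  // Parent already cancelled; do not launch this call.
  }
  run_call(child.get());      // The call observes only its own manager.
  parent->Deregister(token);  // Completed: detach from the parent.
}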
CancellationManager* c_mgr = new CancellationManager; - opts_.cancellation_manager = c_mgr; - ctx->function_library()->Run( - opts_, func_handle_, call_frame, - [call_frame, refcounted, c_mgr](const Status& func_status) { - delete call_frame; - delete c_mgr; - refcounted->UpdateStatus(func_status); - refcounted->Unref(); - }); + CancellationToken token = parent_mgr->get_cancellation_token(); + const bool success = parent_mgr->RegisterCallback( + token, [c_mgr]() { c_mgr->StartCancel(); }); + + opts.cancellation_manager = c_mgr; + if (!success) { + delete c_mgr; + refcounted->UpdateStatus(errors::Cancelled( + "MapDefunOp functions cancelled because parent graph cancelled")); + break; + } + + auto* call_frame = new MapFunctionCallFrame(compute_opts, this, i); + + refcounted->Ref(); + ctx->function_library()->Run(opts, func_handle_, call_frame, + [call_frame, refcounted, c_mgr, parent_mgr, + token](const Status& func_status) { + parent_mgr->DeregisterCallback(token); + delete c_mgr; + delete call_frame; + refcounted->UpdateStatus(func_status); + refcounted->Unref(); + }); } + + // Unref 1 because refcounted is initialized with refcount = 1 + refcounted->Unref(); } private: FunctionLibraryRuntime::Handle func_handle_; - FunctionLibraryRuntime::Options opts_; std::vector<PartialTensorShape> output_shapes_; + struct ComputeOptions { + // These vary per MapDefunOp::ComputeAsync call, but must persist until + // all calls to the function are complete. This struct also encapsulates + // all the components that need to be passed to each MapFunctionCallFrame. + + const std::vector<Tensor> args; + const std::vector<TensorShape> arg_shapes; + const int64 batch_size; + + // Output of a compute call + std::vector<PartialTensorShape> output_shapes GUARDED_BY(mu); + OpOutputList output GUARDED_BY(mu); + mutex mu; + + // Create a copy of output_shapes because every `Compute` may expect a + // different output shape. + ComputeOptions(std::vector<Tensor> args, + std::vector<TensorShape> arg_shapes, int64 batch_size, + const std::vector<PartialTensorShape>& output_shapes_attr) + : args(std::move(args)), + arg_shapes(std::move(arg_shapes)), + batch_size(batch_size), + output_shapes(output_shapes_attr) {} + }; + + // Get inputs to Compute and check that they are valid. + Status SetupArgs(OpKernelContext* ctx, ComputeOptions** compute_opts) { + int64 batch_size = + ctx->input(0).dims() > 0 ? ctx->input(0).dim_size(0) : -1; + + for (size_t i = 0; i < ctx->num_inputs(); ++i) { + if (ctx->input(i).dims() == 0) { + return errors::InvalidArgument( + "All inputs must have rank at least 1. Input ", i, + " has a rank of 0."); + } else if (ctx->input(i).dim_size(0) != batch_size) { + return errors::InvalidArgument( + "All inputs must have the same dimension 0. 
Input ", i, + " has leading dimension ", ctx->input(i).dim_size(0), + ", while all previous inputs have leading dimension ", batch_size); + } + } + + std::vector<Tensor> args; + std::vector<TensorShape> arg_shapes; + args.reserve(ctx->num_inputs()); + arg_shapes.reserve(ctx->num_inputs()); + + for (size_t i = 0; i < ctx->num_inputs(); ++i) { + args.push_back(ctx->input(i)); + arg_shapes.push_back(ctx->input(i).shape()); + arg_shapes.at(i).RemoveDim(0); + } + + *compute_opts = new ComputeOptions(std::move(args), std::move(arg_shapes), + batch_size, output_shapes_); + return Status::OK(); + } + + Status SetupOutputs(OpKernelContext* ctx, ComputeOptions* opts) { + mutex_lock l(opts->mu); + TF_RETURN_IF_ERROR(ctx->output_list("output", &opts->output)); + + for (size_t i = 0; i < output_types().size(); ++i) { + if (output_shapes_.at(i).IsFullyDefined()) { + Tensor* out = nullptr; + TensorShape output_shape; + output_shapes_.at(i).AsTensorShape(&output_shape); + output_shape.InsertDim(0, opts->batch_size); + TF_RETURN_IF_ERROR(opts->output.allocate(i, output_shape, &out)); + } + } + return Status::OK(); + } + class MapFunctionCallFrame : public CallFrameInterface { public: - MapFunctionCallFrame(const std::vector<Tensor>& args, - const std::vector<TensorShape>& arg_shapes, - std::vector<PartialTensorShape>* output_shapes, - mutex* output_shapes_mutex, OpOutputList* output, - OpKernel* kernel, size_t iter, size_t batch_size) - : args_(args), - arg_shapes_(arg_shapes), - output_shapes_(output_shapes), - output_shapes_mutex_(output_shapes_mutex), - output_(output), - kernel_(kernel), - iter_(iter), - batch_size_(batch_size) {} + MapFunctionCallFrame(ComputeOptions* compute_opts, OpKernel* kernel, + size_t iter) + : compute_opts_(compute_opts), kernel_(kernel), iter_(iter) {} ~MapFunctionCallFrame() override {} - size_t num_args() const override { return args_.size(); } + size_t num_args() const override { return compute_opts_->args.size(); } + size_t num_retvals() const override { return static_cast<size_t>(kernel_->num_outputs()); } Status GetArg(int index, Tensor* val) const override { - if (index < 0 || index >= args_.size()) { + if (index < 0 || index >= compute_opts_->args.size()) { return errors::InvalidArgument( "Mismatch in number of function inputs."); } - bool result = val->CopyFrom(args_.at(index).Slice(iter_, iter_ + 1), - arg_shapes_.at(index)); + bool result = + val->CopyFrom(compute_opts_->args.at(index).Slice(iter_, iter_ + 1), + compute_opts_->arg_shapes.at(index)); if (!result) { return errors::Internal("GetArg failed."); } else if (!val->IsAligned()) { @@ -217,36 +265,34 @@ class MapDefunOp : public AsyncOpKernel { index); } { // Locking scope - mutex_lock l(*output_shapes_mutex_); - if (!output_shapes_->at(index).IsCompatibleWith(val.shape())) { + mutex_lock l(compute_opts_->mu); + if (!compute_opts_->output_shapes.at(index).IsCompatibleWith( + val.shape())) { return errors::InvalidArgument( "Mismatch in function retval shape, ", val.shape(), - ", and expected output shape,", - output_shapes_->at(index).DebugString(), "."); + ", and expected output shape, ", + compute_opts_->output_shapes.at(index).DebugString(), "."); } - if (!output_shapes_->at(index).IsFullyDefined()) { + if (!compute_opts_->output_shapes.at(index).IsFullyDefined()) { // Given val, we have new information about the output shape at // this index. Store the shape and allocate the output accordingly. 
- output_shapes_->at(index) = val.shape(); + compute_opts_->output_shapes.at(index) = val.shape(); Tensor* out = nullptr; TensorShape actual_shape = val.shape(); - actual_shape.InsertDim(0, batch_size_); - TF_RETURN_IF_ERROR(output_->allocate(index, actual_shape, &out)); + actual_shape.InsertDim(0, compute_opts_->batch_size); + TF_RETURN_IF_ERROR( + compute_opts_->output.allocate(index, actual_shape, &out)); } + return batch_util::CopyElementToSlice( + val, (compute_opts_->output)[index], iter_); } - return batch_util::CopyElementToSlice(val, (*output_)[index], iter_); } private: - const std::vector<Tensor>& args_; - const std::vector<TensorShape>& arg_shapes_; - std::vector<PartialTensorShape>* output_shapes_; - mutex* output_shapes_mutex_; - OpOutputList* output_; + ComputeOptions* const compute_opts_; // Not owned const OpKernel* kernel_; const size_t iter_; - const size_t batch_size_; }; }; diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc new file mode 100644 index 0000000000..9aa505f4f1 --- /dev/null +++ b/tensorflow/core/kernels/data/model_dataset_op.cc @@ -0,0 +1,183 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/data/dataset.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/cpu_info.h" + +namespace tensorflow { +namespace data { +namespace { + +const int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros; + +class ModelDatasetOp : public UnaryDatasetOpKernel { + public: + explicit ModelDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) {} + + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override { + *output = new Dataset(ctx, input); + } + + private: + class Dataset : public DatasetBase { + public: + explicit Dataset(OpKernelContext* ctx, const DatasetBase* input) + : DatasetBase(DatasetContext(ctx)), input_(input) { + input_->Ref(); + } + + ~Dataset() override { input_->Unref(); } + + std::unique_ptr<IteratorBase> MakeIteratorInternal( + const string& prefix) const override { + return std::unique_ptr<IteratorBase>( + new Iterator({this, strings::StrCat(prefix, "::Model")})); + } + + const DataTypeVector& output_dtypes() const override { + return input_->output_dtypes(); + } + const std::vector<PartialTensorShape>& output_shapes() const override { + return input_->output_shapes(); + } + + string DebugString() const override { return "ModelDatasetOp::Dataset"; } + + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + Node* input_graph_node = nullptr; + TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); + TF_RETURN_IF_ERROR(b->AddDataset(this, 
{input_graph_node}, output)); + return Status::OK(); + } + + private: + class Iterator : public DatasetIterator<Dataset> { + public: + explicit Iterator(const Params& params) + : DatasetIterator<Dataset>(params), + model_(std::make_shared<model::Model>()) {} + + ~Iterator() override { + // Signal the optimize thread to terminate it. We will then join that + // thread when we delete `this->optimize_thread_`. + mutex_lock l(mu_); + cancelled_ = true; + cond_var_.notify_all(); + } + + Status Initialize(IteratorContext* ctx) override { + IteratorContext ctx_with_model(CreateParams(ctx)); + return dataset()->input_->MakeIterator(&ctx_with_model, prefix(), + &input_impl_); + } + + Status GetNextInternal(IteratorContext* ctx, + std::vector<Tensor>* out_tensors, + bool* end_of_sequence) override { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(EnsureOptimizeThreadStarted(ctx)); + IteratorContext ctx_with_model(CreateParams(ctx)); + return input_impl_->GetNext(&ctx_with_model, out_tensors, + end_of_sequence); + } + + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); + return Status::OK(); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); + return Status::OK(); + } + + IteratorContext::Params CreateParams(IteratorContext* ctx) { + IteratorContext::Params params = ctx->params(); + params.model = model_; + return params; + } + + private: + Status EnsureOptimizeThreadStarted(IteratorContext* ctx) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (!optimize_thread_) { + std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx)); + optimize_thread_.reset(ctx->env()->StartThread( + {}, "optimize_thread", + [this, new_ctx]() { OptimizeThread(new_ctx); })); + } + return Status::OK(); + } + + void OptimizeThread(const std::shared_ptr<IteratorContext>& ctx) { + int64 last_optimization_ms = 0; + int64 optimization_period_ms = 10; + while (true) { + { + mutex_lock l(mu_); + while (!cancelled_ && + last_optimization_ms + optimization_period_ms >= + ctx->env()->NowMicros() / EnvTime::kMillisToMicros) { + cond_var_.wait_for( + l, std::chrono::milliseconds( + last_optimization_ms + optimization_period_ms - + ctx->env()->NowMicros() / EnvTime::kMillisToMicros)); + } + if (cancelled_) return; + } + model_->Optimize(port::NumSchedulableCPUs()); + // Exponentially increase the period of running the optimization + // until a threshold is reached. 
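The update below grows the optimization period geometrically from its initial 10 ms until it reaches the kOptimizationPeriodThresholdMs cap, after which it stays at the threshold. A one-function sketch of the update step (hypothetical helper name; the starting value and cap are whatever the surrounding code defines):

#include <algorithm>
#include <cstdint>

// One update step: double the period but never exceed the threshold.
int64_t NextOptimizationPeriodMs(int64_t period_ms, int64_t threshold_ms) {
  return std::min(period_ms * 2, threshold_ms);
}
// Starting at 10 ms this yields 20, 40, 80, ... until the threshold is hit,
// and every subsequent step returns the threshold itself.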
+ if (optimization_period_ms < kOptimizationPeriodThresholdMs) { + if (optimization_period_ms << 1 < kOptimizationPeriodThresholdMs) { + optimization_period_ms <<= 1; + } else { + optimization_period_ms = kOptimizationPeriodThresholdMs; + } + } + last_optimization_ms = + ctx->env()->NowMicros() / EnvTime::kMillisToMicros; + } + } + + mutex mu_; + condition_variable cond_var_; + std::shared_ptr<model::Model> model_; + std::unique_ptr<Thread> optimize_thread_ GUARDED_BY(mu_); + bool cancelled_ GUARDED_BY(mu_) = false; + std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_); + }; + + const DatasetBase* input_; + }; +}; + +REGISTER_KERNEL_BUILDER(Name("ModelDataset").Device(DEVICE_CPU), + ModelDatasetOp); +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc new file mode 100644 index 0000000000..5f143967d9 --- /dev/null +++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc @@ -0,0 +1,633 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <deque> + +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_op_kernel.h" +#include "tensorflow/core/kernels/data/dataset_utils.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +namespace data { +namespace { + +struct HostBufferElement { + Status status; + bool end_of_sequence; + std::vector<Tensor> value; +}; + +using MultiDeviceIteratorCallback = + std::function<void(const HostBufferElement&)>; + +class MultiDeviceIterator : public ResourceBase { + public: + MultiDeviceIterator(const DataTypeVector& output_types, + const std::vector<PartialTensorShape>& output_shapes, + const std::vector<string>& devices, + std::unique_ptr<FunctionLibraryDefinition> flib_def, + std::unique_ptr<ProcessFunctionLibraryRuntime> pflr, + FunctionLibraryRuntime* lib) + : output_types_(output_types), + output_shapes_(output_shapes), + devices_(devices), + flib_def_(std::move(flib_def)), + pflr_(std::move(pflr)), + lib_(lib) { + DCHECK(lib_ != nullptr); + } + + string DebugString() override { + return strings::StrCat("MultiDeviceIterator for ", devices_.size(), + " devices"); + } + + Status Init(std::unique_ptr<IteratorBase> iterator, int64 max_buffer_size, + int64* incarnation_id) { + if (iterator) { + TF_RETURN_IF_ERROR( + VerifyTypesMatch(output_types_, iterator->output_dtypes())); + TF_RETURN_IF_ERROR( + VerifyShapesCompatible(output_shapes_, iterator->output_shapes())); + } + + mutex_lock l(mu_); + if 
(multi_device_buffer_) { + multi_device_buffer_->Reset(); + } + + ++incarnation_id_; + *incarnation_id = incarnation_id_; + + multi_device_buffer_.reset( + new MultiDeviceBuffer(devices_.size(), max_buffer_size, incarnation_id_, + std::move(iterator))); + return Status::OK(); + } + + void GetNextFromShard(IteratorContext* ctx, int shard_num, + int64 incarnation_id, + MultiDeviceIteratorCallback callback) { + if (lib_ != nullptr) { + ctx->set_lib(lib_); + } + tf_shared_lock l(mu_); + multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id, + std::move(callback)); + } + + const DataTypeVector& output_types() const { return output_types_; } + + const std::vector<PartialTensorShape>& output_shapes() const { + return output_shapes_; + } + + std::shared_ptr<const FunctionLibraryDefinition> function_library() { + tf_shared_lock l(mu_); + return lib_def_; + } + + FunctionLibraryRuntime* const lib() { + tf_shared_lock l(mu_); + return lib_; + } + + private: + // A private class that uses a background thread to keep a per device buffer + // full. + class MultiDeviceBuffer { + public: + MultiDeviceBuffer(size_t size, int64 max_buffer_size, int64 incarnation_id, + std::unique_ptr<IteratorBase> host_iterator) + : buffer_(size), + size_(size), + max_buffer_size_(max_buffer_size), + incarnation_id_(incarnation_id), + host_iterator_(std::move(host_iterator)) {} + + ~MultiDeviceBuffer() { + { + mutex_lock l(mu_); + if (!background_thread_started_) return; + } + Reset(); + } + + void Reset() LOCKS_EXCLUDED(mu_) { + { + mutex_lock l(mu_); + if (background_thread_finished_) { + return; + } + + cancelled_ = true; + // Wake up the background thread. + for (int i = 0; i < size_; ++i) { + buffer_[i].cond_var.notify_all(); + } + + // Make sure background thread has finished first. + while (!background_thread_finished_) { + shutdown_cond_var_.wait(l); + } + } + RunPendingCallbacks(); + } + + void GetNextFromShard(IteratorContext* ctx, int shard_num, + int64 incarnation_id, + MultiDeviceIteratorCallback callback) { + HostBufferElement elem; + if (incarnation_id_ != incarnation_id) { + elem.status = errors::InvalidArgument("Invalid incarnation id"); + callback(elem); + return; + } + + bool produced_output = false; + { + mutex_lock l(mu_); + if (cancelled_) { + elem.status = errors::Cancelled("Cancelled Multidevice iterator"); + callback(elem); + return; + } + + EnsureBackgroundThreadStarted(ctx); + + if (!buffer_[shard_num].data.empty()) { + produced_output = true; + std::swap(elem, buffer_[shard_num].data.front()); + buffer_[shard_num].data.pop_front(); + // Wake up background thread if it is blocked on this element. + if (buffer_[shard_num].data.size() == max_buffer_size_ - 1) { + buffer_[shard_num].cond_var.notify_all(); + } + } else { + if (background_thread_finished_) { + produced_output = true; + elem.end_of_sequence = true; + } else { + buffer_[shard_num].callbacks.push_back(std::move(callback)); + callback = nullptr; + } + } + } + + if (produced_output) { + callback(elem); + } + } + + private: + void EnsureBackgroundThreadStarted(IteratorContext* ctx) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (!background_thread_) { + background_thread_.reset(ctx->env()->StartThread( + {}, "multi_device_iterator_background_thread", + std::bind(&MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread, + this, new IteratorContext(*ctx)))); + } + } + + void RunPendingCallbacks() LOCKS_EXCLUDED(mu_) { + // Run all remaining callbacks. 
+ std::vector<MultiDeviceIteratorCallback> cancellation_callbacks; + std::vector<HostBufferElement> cancellation_elements; + { + mutex_lock l(mu_); + + for (int i = 0; i < size_; ++i) { + while (!buffer_[i].callbacks.empty()) { + if (buffer_[i].data.empty()) { + HostBufferElement elem; + elem.status = + errors::Cancelled("Cancelled and buffer not filled."); + cancellation_elements.push_back(std::move(elem)); + } else { + cancellation_elements.push_back( + std::move(buffer_[i].data.front())); + buffer_[i].data.pop_front(); + } + cancellation_callbacks.push_back( + std::move(buffer_[i].callbacks.front())); + buffer_[i].callbacks.pop_front(); + } + } + } + for (int i = 0; i < cancellation_callbacks.size(); ++i) { + cancellation_callbacks[i](cancellation_elements[i]); + } + } + + void BackgroundThread(IteratorContext* ctx) { + { + mutex_lock l(mu_); + background_thread_started_ = true; + } + std::unique_ptr<IteratorContext> cleanup(ctx); + int shard_to_fetch = 0; + while (true) { + HostBufferElement elem; + MultiDeviceIteratorCallback callback = nullptr; + bool end_of_iterator = false; + + { + mutex_lock l(mu_); + while (!cancelled_ && + buffer_[shard_to_fetch].data.size() >= max_buffer_size_) { + buffer_[shard_to_fetch].cond_var.wait(l); + } + + if (cancelled_) { + background_thread_finished_ = true; + shutdown_cond_var_.notify_all(); + return; + } + } + + elem.status = + host_iterator_->GetNext(ctx, &elem.value, &elem.end_of_sequence); + + if (elem.status.ok() && elem.end_of_sequence) { + end_of_iterator = true; + } + + { + mutex_lock l(mu_); + // Try to find a callback, else just push stuff into buffer. + if (!buffer_[shard_to_fetch].callbacks.empty()) { + callback = buffer_[shard_to_fetch].callbacks.front(); + buffer_[shard_to_fetch].callbacks.pop_front(); + } else { + buffer_[shard_to_fetch].data.push_back(std::move(elem)); + elem = HostBufferElement(); + } + } + + if (callback) { + (*ctx->runner())(std::bind(std::move(callback), std::move(elem))); + } + + // Finish off the thread if we reach the end of the iterator. Runs + // pending callbacks. + if (end_of_iterator) { + { + mutex_lock l(mu_); + background_thread_finished_ = true; + shutdown_cond_var_.notify_all(); + } + RunPendingCallbacks(); + return; + } + shard_to_fetch = (shard_to_fetch + 1) % size_; + } + } + + struct HostBuffer { + condition_variable cond_var; + std::deque<HostBufferElement> data; + std::deque<MultiDeviceIteratorCallback> callbacks; + }; + + mutex mu_; + std::unique_ptr<Thread> background_thread_ GUARDED_BY(mu_); + bool background_thread_finished_ GUARDED_BY(mu_) = false; + bool background_thread_started_ GUARDED_BY(mu_) = false; + bool cancelled_ GUARDED_BY(mu_) = false; + condition_variable shutdown_cond_var_ GUARDED_BY(mu_); + + std::vector<HostBuffer> buffer_; + + const size_t size_; + const int64 max_buffer_size_; + const int64 incarnation_id_; + const std::unique_ptr<IteratorBase> host_iterator_; + }; + + mutex mu_; + const DataTypeVector output_types_; + const std::vector<PartialTensorShape> output_shapes_; + const std::vector<string> devices_; + const std::unique_ptr<FunctionLibraryDefinition> flib_def_; + const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_; + FunctionLibraryRuntime* const lib_ = nullptr; // not owned. + std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_); + + int64 incarnation_id_ GUARDED_BY(mu_) = 0; + std::unique_ptr<MultiDeviceBuffer> multi_device_buffer_ GUARDED_BY(mu_); +}; + +// Just creates a MultiDeviceIterator and returns it. 
+class MultiDeviceIteratorHandleOp : public OpKernel { + public: + explicit MultiDeviceIteratorHandleOp(OpKernelConstruction* ctx) + : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("devices", &devices_)); + } + + // The resource is deleted from the resource manager only when it is private + // to kernel. + ~MultiDeviceIteratorHandleOp() override { + if (resource_ != nullptr) { + resource_->Unref(); + if (cinfo_.resource_is_private_to_kernel()) { + if (!cinfo_.resource_manager() + ->template Delete<MultiDeviceIterator>(cinfo_.container(), + cinfo_.name()) + .ok()) { + // Do nothing; the resource can have been deleted by session resets. + } + } + } + } + + void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) { + { + mutex_lock l(mu_); + if (resource_ == nullptr) { + FunctionLibraryRuntime* lib; + std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr); + std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr); + OP_REQUIRES_OK(context, context->function_library()->Clone( + &flib_def, &pflr, &lib)); + ResourceMgr* mgr = context->resource_manager(); + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); + + MultiDeviceIterator* resource; + OP_REQUIRES_OK( + context, + mgr->LookupOrCreate<MultiDeviceIterator>( + cinfo_.container(), cinfo_.name(), &resource, + [this, lib, &flib_def, &pflr](MultiDeviceIterator** ret) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + *ret = new MultiDeviceIterator( + output_types_, output_shapes_, devices_, + std::move(flib_def), std::move(pflr), lib); + return Status::OK(); + })); + + Status s = VerifyResource(resource); + if (TF_PREDICT_FALSE(!s.ok())) { + resource->Unref(); + context->SetStatus(s); + return; + } + + resource_ = resource; + } + } + OP_REQUIRES_OK(context, MakeResourceHandleToOutput( + context, 0, cinfo_.container(), cinfo_.name(), + MakeTypeIndex<MultiDeviceIterator>())); + } + + private: + // During the first Compute(), resource is either created or looked up using + // shared_name. In the latter case, the resource found should be verified if + // it is compatible with this op's configuration. The verification may fail in + // cases such as two graphs asking queues of the same shared name to have + // inconsistent capacities. + Status VerifyResource(MultiDeviceIterator* resource) { + TF_RETURN_IF_ERROR( + VerifyTypesMatch(output_types_, resource->output_types())); + TF_RETURN_IF_ERROR( + VerifyShapesCompatible(output_shapes_, resource->output_shapes())); + return Status::OK(); + } + + mutex mu_; + ContainerInfo cinfo_; // Written once under mu_ then constant afterwards. + MultiDeviceIterator* resource_ GUARDED_BY(mu_) = nullptr; + DataTypeVector output_types_; + std::vector<PartialTensorShape> output_shapes_; + const int graph_def_version_; + string name_; + string container_; + std::vector<string> devices_; +}; + +REGISTER_KERNEL_BUILDER(Name("MultiDeviceIterator").Device(DEVICE_CPU), + MultiDeviceIteratorHandleOp); + +// Calls init on the MultiDeviceIterator. 
+class MultiDeviceIteratorInitOp : public OpKernel { + public: + explicit MultiDeviceIteratorInitOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor* tensor_max_buffer_size; + OP_REQUIRES_OK(ctx, ctx->input("max_buffer_size", &tensor_max_buffer_size)); + int64 max_buffer_size = tensor_max_buffer_size->scalar<int64>()(); + + DatasetBase* dataset; + OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset)); + MultiDeviceIterator* resource; + OP_REQUIRES_OK(ctx, + LookupResource(ctx, HandleFromInput(ctx, 1), &resource)); + core::ScopedUnref unref(resource); + + std::unique_ptr<IteratorBase> iterator; + IteratorContext iter_ctx(ctx); + iter_ctx.set_lib(resource->lib()); + OP_REQUIRES_OK( + ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator)); + int64 incarnation_id; + OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), max_buffer_size, + &incarnation_id)); + Tensor tensor_incarnation_id(DT_INT64, TensorShape({})); + tensor_incarnation_id.scalar<int64>()() = incarnation_id; + OP_REQUIRES_OK(ctx, + ctx->set_output("incarnation_id", tensor_incarnation_id)); + } +}; + +REGISTER_KERNEL_BUILDER(Name("MultiDeviceIteratorInit").Device(DEVICE_CPU), + MultiDeviceIteratorInitOp); + +// Calls GetNextFromShard(shard) and returns a vector of Tensors as output. +// TODO(rohanj): Implement using BackgroundWorker that Derek built? +class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel { + public: + explicit MultiDeviceIteratorGetNextFromShardOp(OpKernelConstruction* ctx) + : AsyncOpKernel(ctx), + thread_pool_(new thread::ThreadPool( + ctx->env(), ThreadOptions(), + strings::StrCat("multi_device_iterator_get_next_thread_", + SanitizeThreadSuffix(name())), + 1 /* num_threads */, false /* low_latency_hint */)) {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { + const Tensor* tensor_shard_num; + OP_REQUIRES_OK_ASYNC(ctx, ctx->input("shard_num", &tensor_shard_num), done); + int32 shard_num = tensor_shard_num->scalar<int32>()(); + + const Tensor* tensor_incarnation_id; + OP_REQUIRES_OK_ASYNC( + ctx, ctx->input("incarnation_id", &tensor_incarnation_id), done); + int64 incarnation_id = tensor_incarnation_id->scalar<int64>()(); + + MultiDeviceIterator* iterator; + OP_REQUIRES_OK_ASYNC( + ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done); + thread_pool_->Schedule(std::bind( + [ctx, iterator, shard_num, incarnation_id](DoneCallback done) { + IteratorContext::Params params; + params.env = ctx->env(); + params.runner = *(ctx->runner()); + params.function_library = iterator->function_library(); + DeviceBase* device = ctx->function_library()->device(); + params.allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; + IteratorContext iter_ctx(std::move(params)); + + MultiDeviceIteratorCallback callback = std::bind( + [ctx](const HostBufferElement& elem, DoneCallback done) { + // iterator->Unref(); + Status s = elem.status; + if (!s.ok()) { + ctx->SetStatus(s); + } else if (elem.end_of_sequence) { + ctx->SetStatus(errors::OutOfRange("End of sequence")); + } else { + for (int i = 0; i < elem.value.size(); ++i) { + ctx->set_output(i, elem.value[i]); + } + } + done(); + }, + std::placeholders::_1, std::move(done)); + + iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id, + callback); + iterator->Unref(); + }, + std::move(done))); + } + + private: + std::unique_ptr<thread::ThreadPool> thread_pool_; +}; + 
+REGISTER_KERNEL_BUILDER( + Name("MultiDeviceIteratorGetNextFromShard").Device(DEVICE_CPU), + MultiDeviceIteratorGetNextFromShardOp); + +class MultiDeviceIteratorToStringHandleOp : public OpKernel { + public: + explicit MultiDeviceIteratorToStringHandleOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& resource_handle_t = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()), + errors::InvalidArgument("resource_handle must be a scalar")); + + // Validate that the handle corresponds to a real resource, and + // that it is an MultiDeviceIterator. + MultiDeviceIterator* resource; + OP_REQUIRES_OK(ctx, + LookupResource(ctx, HandleFromInput(ctx, 0), &resource)); + resource->Unref(); + + Tensor* string_handle_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({}), &string_handle_t)); + string_handle_t->scalar<string>()() = + resource_handle_t.scalar<ResourceHandle>()().SerializeAsString(); + } +}; + +REGISTER_KERNEL_BUILDER( + Name("MultiDeviceIteratorToStringHandle").Device(DEVICE_CPU), + MultiDeviceIteratorToStringHandleOp); + +class MultiDeviceIteratorFromStringHandleOp : public OpKernel { + public: + explicit MultiDeviceIteratorFromStringHandleOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + OP_REQUIRES( + ctx, + output_types_.empty() || output_shapes_.empty() || + output_types_.size() == output_shapes_.size(), + errors::InvalidArgument("If both 'output_types' and 'output_shapes' " + "are set, they must have the same length.")); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& string_handle_t = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(string_handle_t.shape()), + errors::InvalidArgument("string_handle must be a scalar")); + + ResourceHandle resource_handle; + OP_REQUIRES( + ctx, + resource_handle.ParseFromString(string_handle_t.scalar<string>()()), + errors::InvalidArgument( + "Could not parse string_handle as a valid ResourceHandle")); + + OP_REQUIRES( + ctx, resource_handle.device() == ctx->device()->attributes().name(), + errors::InvalidArgument("Attempted create an iterator on device \"", + ctx->device()->attributes().name(), + "\" from handle defined on device \"", + resource_handle.device(), "\"")); + + // Validate that the handle corresponds to a real resource, and + // that it is an MultiDeviceIterator. 
+ MultiDeviceIterator* resource; + OP_REQUIRES_OK(ctx, LookupResource(ctx, resource_handle, &resource)); + core::ScopedUnref unref_iterator(resource); + if (!output_types_.empty()) { + OP_REQUIRES_OK(ctx, + VerifyTypesMatch(output_types_, resource->output_types())); + } + if (!output_shapes_.empty()) { + OP_REQUIRES_OK(ctx, VerifyShapesCompatible(output_shapes_, + resource->output_shapes())); + } + + Tensor* resource_handle_t; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, TensorShape({}), &resource_handle_t)); + resource_handle_t->scalar<ResourceHandle>()() = resource_handle; + } + + private: + DataTypeVector output_types_; + std::vector<PartialTensorShape> output_shapes_; +}; + +REGISTER_KERNEL_BUILDER( + Name("MultiDeviceIteratorFromStringHandle").Device(DEVICE_CPU), + MultiDeviceIteratorFromStringHandleOp); + +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc index b372d31a93..2ab5c83082 100644 --- a/tensorflow/core/kernels/data/optional_ops.cc +++ b/tensorflow/core/kernels/data/optional_ops.cc @@ -108,11 +108,8 @@ class OptionalFromValueOp : public OpKernel { void Compute(OpKernelContext* ctx) override { OpInputList components_input; OP_REQUIRES_OK(ctx, ctx->input_list("components", &components_input)); - std::vector<Tensor> components; - components.reserve(components_input.size()); - for (const Tensor& component_t : components_input) { - components.push_back(component_t); - } + std::vector<Tensor> components(components_input.begin(), + components_input.end()); OP_REQUIRES_OK( ctx, WriteOptionalWithValueToOutput(ctx, 0, std::move(components))); } @@ -216,6 +213,14 @@ static Status OptionalDeviceCopy( std::vector<Tensor> to_values; to_values.reserve(from_values.size()); for (const Tensor& t : from_values) { + if (t.dtype() == DT_VARIANT) { + // TODO(b/116349787): Implement support for nested variants. 
+ return errors::Unimplemented( + "Support for copying nested variants to device has not yet been " + "implemented."); + } + } + for (const Tensor& t : from_values) { if (DMAHelper::CanUseDMA(&t)) { Tensor tmp(t.dtype()); TF_RETURN_IF_ERROR(copy(t, &tmp)); @@ -231,10 +236,9 @@ static Status OptionalDeviceCopy( return Status::OK(); } -#define REGISTER_OPTIONAL_COPY(DIRECTION) \ - INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \ - OptionalVariant, DIRECTION, kOptionalVariantTypeName, \ - OptionalDeviceCopy) +#define REGISTER_OPTIONAL_COPY(DIRECTION) \ + INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \ + OptionalVariant, DIRECTION, OptionalDeviceCopy) REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::HOST_TO_DEVICE); REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::DEVICE_TO_HOST); diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc index fd0e6c4cd0..7b01c3b4e0 100644 --- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc @@ -207,6 +207,7 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel { : DatasetIterator<Dataset>(params) {} Status Initialize(IteratorContext* ctx) override { + AddConstantParameter(ctx, "batch_size", dataset()->batch_size_); return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); } diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc index 640f1565b7..2e6e0465f7 100644 --- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include <atomic> #include <deque> #include <utility> @@ -44,14 +45,6 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } - int64 cycle_length = 0; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "cycle_length", &cycle_length)); @@ -83,8 +76,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { std::unique_ptr<CapturedFunction> captured_func; OP_REQUIRES_OK( - ctx, CapturedFunction::Create( - interleave_func_, std::move(other_arguments), &captured_func)); + ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments", + &captured_func)); *output = new Dataset(ctx, input, interleave_func_, std::move(captured_func), @@ -252,6 +245,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { } Status Initialize(IteratorContext* ctx) override { + AddConstantParameter(ctx, "parallelism", dataset()->cycle_length_); TF_RETURN_IF_ERROR( dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_)); return dataset()->captured_func_->Instantiate(ctx); @@ -351,11 +345,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { if (must_wait_for_input) { // Wait for elements to become available. 
+ RecordStop(ctx); if (dataset()->sloppy_) { sloppy_cond_var_.wait(l); } else { workers_[interleave_indices_[next_index_]].cond_var.wait(l); } + RecordStart(ctx); } } return errors::Cancelled( @@ -484,10 +480,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { if (reader->Contains(full_name("worker_threads_running"))) { worker_threads_.reserve(dataset()->num_threads()); for (size_t i = 0; i < dataset()->num_threads(); ++i) { + std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx)); worker_threads_.emplace_back(ctx->env()->StartThread( {}, "worker_thread", - std::bind(&Iterator::WorkerThread, this, - new IteratorContext(*ctx), i))); + [this, new_ctx, i]() { WorkerThread(new_ctx, i); })); } } return Status::OK(); @@ -583,10 +579,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } workers_[i].SetInputs(s, std::move(args)); + std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx)); worker_threads_.emplace_back(ctx->env()->StartThread( {}, "worker_thread", - std::bind(&Iterator::WorkerThread, this, - new IteratorContext(*ctx), i))); + [this, new_ctx, i]() { WorkerThread(new_ctx, i); })); if (i < dataset()->cycle_length_) { interleave_indices_.push_back(i); } else { @@ -601,7 +597,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { } // Produces elements into the worker's output buffers. - void WorkerThread(IteratorContext* ctx_ptr, const int64 thread_index) { + void WorkerThread(const std::shared_ptr<IteratorContext>& ctx, + const int64 thread_index) { // Notes on checkpointing thread local state, i.e., `WorkerThreadState`: // // 1. Any local state that may need to be checkpointed should be kept @@ -622,10 +619,11 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { // std::function arguments are copy-constructable, so we pass raw // pointers, and then immediately wrap them to ensure correct ownership. - std::unique_ptr<IteratorContext> ctx(ctx_ptr); - auto cleanup = gtl::MakeCleanup([this, thread_index] { + RecordStart(ctx.get()); + auto cleanup = gtl::MakeCleanup([this, thread_index, ctx] { mutex_lock l(mu_); workers_[thread_index].cond_var.notify_all(); + RecordStop(ctx.get()); }); bool make_new_iterator; { @@ -651,9 +649,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { // 1. Build a new iterator or use the existing one. if (make_new_iterator) { // 1a. Get new input tensors or use the exiting ones. - bool read_new_input; - { tf_shared_lock l(ckpt_mu_); // worker_thread_states_[thread_index].input will be non-empty @@ -665,7 +661,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { if (read_new_input) { mutex_lock l(mu_); while (!cancelled_ && !workers_[thread_index].is_producing) { + RecordStop(ctx.get()); workers_[thread_index].cond_var.wait(l); + RecordStart(ctx.get()); } if (cancelled_) return; // Copy the input tensors so that we do not need to block on `mu_` @@ -715,7 +713,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { // Wait for space in the prefetch queue. while (!cancelled_ && workers_[thread_index].outputs.size() == dataset()->buffer_output_elements_) { + RecordStop(ctx.get()); workers_[thread_index].cond_var.wait(l); + RecordStart(ctx.get()); } if (cancelled_) return; tf_shared_lock ckpt_l(ckpt_mu_); @@ -764,7 +764,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { // Wait for space in the prefetch queue. 
while (!cancelled_ && workers_[thread_index].outputs.size() == dataset()->buffer_output_elements_) { + RecordStop(ctx.get()); workers_[thread_index].cond_var.wait(l); + RecordStart(ctx.get()); } if (cancelled_) return; @@ -1093,9 +1095,6 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - int64 cycle_length = 0; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "cycle_length", &cycle_length)); @@ -1111,7 +1110,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { int64 num_parallel_calls; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls", &num_parallel_calls)); - OP_REQUIRES(ctx, num_parallel_calls > 0, + OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune, errors::InvalidArgument( "num_parallel_calls must be greater than zero.")); OP_REQUIRES( @@ -1119,16 +1118,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { errors::InvalidArgument( "num_parallel_calls must less than or equal to cycle_length.")); - // TODO(b/114267189): Use `other_arguments(inputs.begin(), inputs.end());`. - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } std::unique_ptr<CapturedFunction> captured_func; OP_REQUIRES_OK( - ctx, CapturedFunction::Create( - interleave_func_, std::move(other_arguments), &captured_func)); + ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments", + &captured_func)); *output = new Dataset(ctx, input, interleave_func_, std::move(captured_func), cycle_length, block_length, @@ -1221,6 +1214,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { public: explicit Iterator(const Params& params) : DatasetIterator<Dataset>(params), + num_parallel_calls_(params.dataset->num_parallel_calls_), args_list_(params.dataset->cycle_length_), current_elements_(params.dataset->cycle_length_), element_in_use_(params.dataset->cycle_length_, false), @@ -1241,6 +1235,16 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { } Status Initialize(IteratorContext* ctx) override { + mutex_lock l(mu_); + if (num_parallel_calls_ == kAutoTune) { + num_parallel_calls_ = 1; + AddTunableParameter(ctx, "parallelism", + &num_parallel_calls_ /* value */, 1 /* min */, + dataset()->cycle_length_ /* max */, &cond_var_); + } else { + AddConstantParameter(ctx, "parallelism", num_parallel_calls_); + } + AddConstantParameter(ctx, "cycle_length", dataset()->cycle_length_); TF_RETURN_IF_ERROR( dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_)); return dataset()->captured_func_->Instantiate(ctx); @@ -1256,7 +1260,9 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { EnsureRunnerThreadStarted(ctx); while (invocation_results_.empty() && (!end_of_input_ || num_open_ > 0)) { + RecordStop(ctx); cond_var_.wait(l); + RecordStart(ctx); } if (!invocation_results_.empty()) { std::swap(result, invocation_results_.front()); @@ -1265,9 +1271,11 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { *end_of_sequence = true; return Status::OK(); } + cond_var_.notify_all(); } - cond_var_.notify_all(); + RecordStop(ctx); result->notification.WaitForNotification(); + RecordStart(ctx); } while (result->skip); if (result->status.ok()) { @@ -1391,6 +1399,8 @@ class 
ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { const std::shared_ptr<IteratorContext>& ctx, int64 cycle_index, const std::vector<std::shared_ptr<InvocationResult>>& results) LOCKS_EXCLUDED(mu_) { + RecordStart(ctx.get()); + auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); }); bool end_of_input = false; for (auto& result : results) { if (!end_of_input) { @@ -1408,56 +1418,66 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { // Release the ownership of the cycle element iterator, closing the // iterator if end of input was encountered. - { - if (end_of_input) { - current_elements_[cycle_index].reset(); - } - mutex_lock l(mu_); - element_in_use_[cycle_index] = false; - num_calls_--; - if (end_of_input) { - args_list_[cycle_index].clear(); - num_open_--; - } + if (end_of_input) { + current_elements_[cycle_index].reset(); + } + mutex_lock l(mu_); + element_in_use_[cycle_index] = false; + num_calls_--; + if (end_of_input) { + args_list_[cycle_index].clear(); + num_open_--; } cond_var_.notify_all(); } - int64 MaxInvocationResults() { - return dataset()->cycle_length_ * dataset()->block_length_; - } - // Method responsible for 1) creating iterators out of input elements, 2) // determining the order in which elements are fetched from the iterators, // and 3) scheduling the fetching of the elements to a threadpool. // // This method runs in the `runner_thread` background thread. void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) { + RecordStart(ctx.get()); + auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); }); + auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(mu_) -> bool { + return element_in_use_[cycle_index_] || + num_calls_ >= num_parallel_calls_ || + invocation_results_.size() >= + dataset()->cycle_length_ * dataset()->block_length_; + }; while (true) { - { - mutex_lock l(mu_); - // Wait until this thread is cancelled, the end of input has been - // reached, or the cycle element at the `cycle_index_` position is - // not in use and there is space in the `invocation_results_` queue. - while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && - (element_in_use_[cycle_index_] || - num_calls_ >= dataset()->num_parallel_calls_ || - invocation_results_.size() >= MaxInvocationResults())) { - cond_var_.wait(l); - } + mutex_lock l(mu_); + // Wait until this thread is cancelled, the end of input has been + // reached, or the cycle element at the `cycle_index_` position is + // not in use and there is space in the `invocation_results_` queue. + while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) { + RecordStop(ctx.get()); + cond_var_.wait(l); + RecordStart(ctx.get()); + } - if (cancelled_ || (end_of_input_ && num_open_ == 0)) { - return; - } + if (cancelled_ || (end_of_input_ && num_open_ == 0)) { + return; + } - while (!element_in_use_[cycle_index_] && - (!end_of_input_ || num_open_ > 0) && - num_calls_ < dataset()->num_parallel_calls_ && - invocation_results_.size() < MaxInvocationResults()) { - if (!current_elements_[cycle_index_]) { - // Try to create a new iterator from the next input element. - Status status = input_impl_->GetNext( - ctx.get(), &args_list_[cycle_index_], &end_of_input_); + while ((!end_of_input_ || num_open_ > 0) && !busy()) { + if (!current_elements_[cycle_index_]) { + // Try to create a new iterator from the next input element. 
+ Status status = input_impl_->GetNext( + ctx.get(), &args_list_[cycle_index_], &end_of_input_); + if (!status.ok()) { + invocation_results_.emplace_back(new InvocationResult()); + std::shared_ptr<InvocationResult>& result = + invocation_results_.back(); + result->status.Update(status); + result->notification.Notify(); + break; + } + if (!end_of_input_) { + Status status = MakeIteratorFromInputElement( + ctx.get(), args_list_[cycle_index_], cycle_index_, + dataset()->captured_func_.get(), prefix(), + ¤t_elements_[cycle_index_]); if (!status.ok()) { invocation_results_.emplace_back(new InvocationResult()); std::shared_ptr<InvocationResult>& result = @@ -1466,39 +1486,25 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { result->notification.Notify(); break; } - if (!end_of_input_) { - Status status = MakeIteratorFromInputElement( - ctx.get(), args_list_[cycle_index_], cycle_index_, - dataset()->captured_func_.get(), prefix(), - ¤t_elements_[cycle_index_]); - if (!status.ok()) { - invocation_results_.emplace_back(new InvocationResult()); - std::shared_ptr<InvocationResult>& result = - invocation_results_.back(); - result->status.Update(status); - result->notification.Notify(); - break; - } - ++num_open_; - } + ++num_open_; } - if (current_elements_[cycle_index_]) { - // Pre-allocate invocation results for outputs to be fetched - // and then fetch the outputs asynchronously. - std::vector<std::shared_ptr<InvocationResult>> results; - results.reserve(dataset()->block_length_); - for (int i = 0; i < dataset()->block_length_; ++i) { - invocation_results_.emplace_back(new InvocationResult()); - results.push_back(invocation_results_.back()); - } - num_calls_++; - element_in_use_[cycle_index_] = true; - thread_pool_->Schedule(std::bind(&Iterator::FetchOutputs, this, - ctx, cycle_index_, - std::move(results))); + } + if (current_elements_[cycle_index_]) { + // Pre-allocate invocation results for outputs to be fetched + // and then fetch the outputs asynchronously. + std::vector<std::shared_ptr<InvocationResult>> results; + results.reserve(dataset()->block_length_); + for (int i = 0; i < dataset()->block_length_; ++i) { + invocation_results_.emplace_back(new InvocationResult()); + results.push_back(invocation_results_.back()); } - cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_; + num_calls_++; + element_in_use_[cycle_index_] = true; + thread_pool_->Schedule(std::bind(&Iterator::FetchOutputs, this, + ctx, cycle_index_, + std::move(results))); } + cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_; } cond_var_.notify_all(); } @@ -1601,6 +1607,9 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel { // and there are elements left to be fetched. condition_variable cond_var_; + // Identifies the maximum number of parallel calls. + std::atomic<int64> num_parallel_calls_; + // Iterator for input elements. 
std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_); diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc index a0cb179eb8..6abe6c8338 100644 --- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc @@ -44,25 +44,17 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel { protected: void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } - int32 num_parallel_calls; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls", &num_parallel_calls)); - OP_REQUIRES(ctx, num_parallel_calls > 0, + OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune, errors::InvalidArgument( "num_parallel_calls must be greater than zero.")); std::unique_ptr<CapturedFunction> captured_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - func_, std::move(other_arguments), - use_inter_op_parallelism_, &captured_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments", + use_inter_op_parallelism_, + &captured_func)); *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_, output_shapes_, use_inter_op_parallelism_, @@ -97,31 +89,26 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel { return captured_func_->Instantiate(ctx); }; - ParallelMapIteratorFunction map_func; - if (use_inter_op_parallelism_) { - map_func = [this](IteratorContext* ctx, - std::vector<Tensor> input_element, - std::vector<Tensor>* result, StatusCallback done) { - captured_func_->RunAsync(ctx, std::move(input_element), result, - std::move(done)); - }; - } else { - map_func = [this](IteratorContext* ctx, - std::vector<Tensor> input_element, - std::vector<Tensor>* result, StatusCallback done) { - (*ctx->runner())(std::bind( - [this, ctx, result](std::vector<Tensor>& input_element, - StatusCallback& done) { - captured_func_->RunAsync(ctx, std::move(input_element), result, - std::move(done)); - }, - std::move(input_element), std::move(done))); + const string& new_prefix = strings::StrCat(prefix, "::ParallelMap"); + ParallelMapIteratorFunction map_func = + [this, new_prefix](IteratorContext* ctx, + std::vector<Tensor> input_element, + std::vector<Tensor>* result, StatusCallback done) { + captured_func_->RunAsync(ctx, std::move(input_element), result, + std::move(done), new_prefix); + }; + if (!use_inter_op_parallelism_) { + map_func = [map_func]( + IteratorContext* ctx, std::vector<Tensor> input_element, + std::vector<Tensor>* result, StatusCallback done) { + (*ctx->runner())(std::bind(map_func, ctx, std::move(input_element), + result, std::move(done))); }; } - return NewParallelMapIterator( - {this, strings::StrCat(prefix, "::ParallelMap")}, input_, - std::move(init_func), std::move(map_func), num_parallel_calls_); + return NewParallelMapIterator({this, new_prefix}, input_, + std::move(init_func), std::move(map_func), + num_parallel_calls_); } const DataTypeVector& output_dtypes() const override { diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc index 4ae742aaaf..ee20249bfe 100644 --- a/tensorflow/core/kernels/data/parallel_map_iterator.cc +++ 
b/tensorflow/core/kernels/data/parallel_map_iterator.cc @@ -14,11 +14,15 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/data/parallel_map_iterator.h" +#include <atomic> #include <deque> #include <functional> #include <utility> #include <vector> +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/cpu_info.h" + namespace tensorflow { namespace data { namespace { @@ -37,11 +41,6 @@ class ParallelMapIterator : public DatasetBaseIterator { num_parallel_calls_(num_parallel_calls) {} ~ParallelMapIterator() override { - // TODO(mrry): Replace this cancellation logic with a - // CancellationManager. The syntax would be more heavyweight, - // but it would be possible to thread a cancellation manager - // through the IteratorContext to upstream, - // potentially-blocking iterators, when we add these. mutex_lock l(mu_); // Cancel the runner thread. cancelled_ = true; @@ -53,6 +52,17 @@ class ParallelMapIterator : public DatasetBaseIterator { } Status Initialize(IteratorContext* ctx) override { + mutex_lock l(mu_); + if (num_parallel_calls_ == kAutoTune) { + num_parallel_calls_ = 1; + // TODO(jsimsa): Surface the number of threads used by `ctx->runner()` and + // use it here for the maximum. + AddTunableParameter(ctx, "parallelism", &num_parallel_calls_ /* value */, + 1 /* min */, port::NumSchedulableCPUs() /* max */, + &cond_var_); + } else { + AddConstantParameter(ctx, "parallelism", num_parallel_calls_); + } TF_RETURN_IF_ERROR( input_dataset_->MakeIterator(ctx, prefix(), &input_impl_)); if (init_func_) { @@ -68,13 +78,17 @@ class ParallelMapIterator : public DatasetBaseIterator { mutex_lock l(mu_); EnsureRunnerThreadStarted(ctx); while (invocation_results_.empty()) { + RecordStop(ctx); cond_var_.wait(l); + RecordStart(ctx); } std::swap(result, invocation_results_.front()); invocation_results_.pop_front(); + cond_var_.notify_all(); } - cond_var_.notify_all(); + RecordStop(ctx); result->notification.WaitForNotification(); + RecordStart(ctx); return ProcessResult(result, out_tensors, end_of_sequence); } @@ -87,9 +101,8 @@ class ParallelMapIterator : public DatasetBaseIterator { } CHECK_EQ(num_calls_, 0); TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); - TF_RETURN_IF_ERROR( - writer->WriteScalar(full_name("invocation_results.size"), - invocation_results_.size())); + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("invocation_results.size"), + invocation_results_.size())); for (size_t i = 0; i < invocation_results_.size(); i++) { std::shared_ptr<InvocationResult> result = invocation_results_[i]; TF_RETURN_IF_ERROR(WriteStatusLocked(writer, i, result->status)); @@ -176,9 +189,9 @@ class ParallelMapIterator : public DatasetBaseIterator { { mutex_lock l(mu_); num_calls_--; + cond_var_.notify_all(); } result->notification.Notify(); - cond_var_.notify_all(); } void CallFunction(const std::shared_ptr<IteratorContext>& ctx, @@ -193,9 +206,8 @@ class ParallelMapIterator : public DatasetBaseIterator { return; } - // Call `func_(input_element)`, store the result in - // `result->return_values`, and notify `result->notification` to unblock - // a consumer. + // Call `func_(input_element)`, store the result in `result->return_values`, + // and notify `result->notification` to unblock a consumer. 
auto done = [this, result](Status status) { result->status.Update(status); CallCompleted(result); @@ -205,8 +217,6 @@ class ParallelMapIterator : public DatasetBaseIterator { std::move(done)); } - int64 MaxInvocationResults() { return num_parallel_calls_; } - Status ProcessResult(const std::shared_ptr<InvocationResult>& result, std::vector<Tensor>* out_tensors, bool* end_of_sequence) { @@ -226,27 +236,33 @@ class ParallelMapIterator : public DatasetBaseIterator { } void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) { + RecordStart(ctx.get()); + auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); }); std::vector<std::shared_ptr<InvocationResult>> new_calls; new_calls.reserve(num_parallel_calls_); + auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(mu_) -> bool { + int64 num_parallel_calls = num_parallel_calls_; + return num_calls_ >= num_parallel_calls || + invocation_results_.size() >= num_parallel_calls; + }; while (true) { { mutex_lock l(mu_); - while (!cancelled_ && - (num_calls_ >= num_parallel_calls_ || - invocation_results_.size() >= MaxInvocationResults())) { + while (!cancelled_ && busy()) { + RecordStop(ctx.get()); cond_var_.wait(l); + RecordStart(ctx.get()); } if (cancelled_) { return; } - while (num_calls_ < num_parallel_calls_ && - invocation_results_.size() < MaxInvocationResults()) { + while (!busy()) { invocation_results_.emplace_back(new InvocationResult()); new_calls.push_back(invocation_results_.back()); num_calls_++; } + cond_var_.notify_all(); } - cond_var_.notify_all(); for (const auto& call : new_calls) { CallFunction(ctx, call); } @@ -295,7 +311,6 @@ class ParallelMapIterator : public DatasetBaseIterator { const DatasetBase* const input_dataset_; // Not owned. const std::function<Status(IteratorContext*)> init_func_; const ParallelMapIteratorFunction map_func_; - const int32 num_parallel_calls_; // Used for coordination between the main thread and the runner thread. mutex mu_; // Used for coordination between the main thread and the runner thread. In @@ -304,6 +319,8 @@ class ParallelMapIterator : public DatasetBaseIterator { // parallelism and there are slots available in the `invocation_results_` // buffer. condition_variable cond_var_; + // Identifies the maximum number of parallel calls. + std::atomic<int64> num_parallel_calls_; // Counts the number of outstanding calls. int64 num_calls_ GUARDED_BY(mu_) = 0; std::unique_ptr<IteratorBase> input_impl_; diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc index 0cf5db017b..c28c06da62 100644 --- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc +++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc @@ -87,11 +87,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel { "Expected len(dense_defaults) == len(dense_keys) but got: ", dense_default_tensors.size(), " vs. 
", dense_keys_.size())); - std::vector<Tensor> dense_defaults; - dense_defaults.reserve(dense_default_tensors.size()); - for (const Tensor& dense_default_t : dense_default_tensors) { - dense_defaults.push_back(dense_default_t); - } + std::vector<Tensor> dense_defaults(dense_default_tensors.begin(), + dense_default_tensors.end()); for (int d = 0; d < dense_keys_.size(); ++d) { const Tensor& def_value = dense_defaults[d]; diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.cc b/tensorflow/core/kernels/data/prefetch_autotuner.cc index 533d0bd5d2..da357339c9 100644 --- a/tensorflow/core/kernels/data/prefetch_autotuner.cc +++ b/tensorflow/core/kernels/data/prefetch_autotuner.cc @@ -26,6 +26,13 @@ PrefetchAutotuner::PrefetchAutotuner(int64 initial_buffer_size) } } +namespace { +// Determines what strategy to use for increasing the buffer size limit. For +// limits less than the threshold, an exponential increase is used, while for +// limits greater than or equal to the threshold, a linear increase is used. +size_t kBufferLimitThreshold = 2048; +} // namespace + void PrefetchAutotuner::RecordConsumption(size_t current_buffer_size) { switch (mode_) { case Mode::kDisabled: @@ -37,7 +44,11 @@ void PrefetchAutotuner::RecordConsumption(size_t current_buffer_size) { return; case Mode::kDownswing: if (current_buffer_size == 0) { - buffer_limit_ *= 2; // Increase the buffer size. + if (buffer_limit_ >= kBufferLimitThreshold) { + buffer_limit_ += kBufferLimitThreshold; + } else { + buffer_limit_ *= 2; + } mode_ = Mode::kUpswing; } return; diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc index ad7d5eb3ff..754ed772db 100644 --- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc +++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/framework/stats_aggregator.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/error_codes.pb.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -102,16 +103,18 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors, bool* end_of_sequence) override { + auto stats_aggregator = ctx->stats_aggregator(); { mutex_lock l(mu_); - auto stats_aggregator = ctx->stats_aggregator(); TF_RETURN_IF_ERROR(EnsurePrefetchThreadStarted(ctx)); // Wait until the next element in the buffer has been // produced, or we are shutting down. 
while (!cancelled_ && buffer_.empty() && !prefetch_thread_finished_ && auto_tuner_.buffer_limit() != 0) { auto_tuner_.RecordEmpty(); + RecordStop(ctx); cond_var_.wait(l); + RecordStart(ctx); } if (cancelled_) { @@ -133,6 +136,14 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { mutex_lock parent_l(parent_mu_); mutex_lock l(mu_); + if (stats_aggregator) { + stats_aggregator->AddScalar( + strings::StrCat(prefix_end_, "::buffer_size"), + static_cast<float>(buffer_.size())); + stats_aggregator->AddScalar( + strings::StrCat(prefix_end_, "::buffer_capacity"), + static_cast<float>(auto_tuner_.buffer_limit())); + } return input_impl_->GetNext(ctx, out_tensors, end_of_sequence); } @@ -216,6 +227,12 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { strings::StrCat(prefix_end_, "::buffer_utilization"), {static_cast<float>(buffer_.size()) / static_cast<float>(auto_tuner_.buffer_limit())}); + stats_aggregator->AddScalar( + strings::StrCat(prefix_end_, "::buffer_size"), + static_cast<float>(buffer_.size())); + stats_aggregator->AddScalar( + strings::StrCat(prefix_end_, "::buffer_capacity"), + static_cast<float>(auto_tuner_.buffer_limit())); } // A new element is available. Forward the status from computing it, and // (if we successfully got an element) the output values. @@ -239,10 +256,10 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { Status EnsurePrefetchThreadStarted(IteratorContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) { if (!prefetch_thread_) { - prefetch_thread_.reset( - ctx->env()->StartThread({}, "prefetch_thread", - std::bind(&Iterator::PrefetchThread, this, - new IteratorContext(*ctx)))); + std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx)); + prefetch_thread_.reset(ctx->env()->StartThread( + {}, "prefetch_thread", + [this, new_ctx]() { PrefetchThread(new_ctx); })); } return Status::OK(); } @@ -251,8 +268,9 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { // buffer. // // It owns the iterator context passed to it. 
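The PrefetchAutotuner change above switches from pure doubling to a hybrid policy: the buffer limit doubles while it is below the 2048-element threshold and grows by 2048 per step once it reaches it. A standalone sketch of that policy, with illustrative names:

#include <cstddef>

// Next buffer limit under the hybrid growth policy: exponential below the
// threshold, linear at or above it.
size_t NextBufferLimit(size_t limit, size_t threshold = 2048) {
  return limit >= threshold ? limit + threshold : limit * 2;
}
// Starting from 1: 2, 4, ..., 1024, 2048, 4096, 6144, 8192, ...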
- void PrefetchThread(IteratorContext* ctx) { - std::unique_ptr<IteratorContext> cleanup(ctx); + void PrefetchThread(const std::shared_ptr<IteratorContext>& ctx) { + RecordStart(ctx.get()); + auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); }); while (true) { std::vector<Tensor> value; @@ -260,7 +278,9 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { { mutex_lock l(mu_); while (!cancelled_ && buffer_.size() >= auto_tuner_.buffer_limit()) { + RecordStop(ctx.get()); cond_var_.wait(l); + RecordStart(ctx.get()); } if (cancelled_) { @@ -277,8 +297,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { mutex_lock parent_l(parent_mu_); bool end_of_sequence; BufferElement buffer_element; - buffer_element.status = - input_impl_->GetNext(ctx, &buffer_element.value, &end_of_sequence); + buffer_element.status = input_impl_->GetNext( + ctx.get(), &buffer_element.value, &end_of_sequence); if (buffer_element.status.ok() && end_of_sequence) { mutex_lock l(mu_); prefetch_thread_finished_ = true; diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc index 6e515d6cc8..dbe31f37b8 100644 --- a/tensorflow/core/kernels/data/scan_dataset_op.cc +++ b/tensorflow/core/kernels/data/scan_dataset_op.cc @@ -45,23 +45,12 @@ class ScanDatasetOp : public UnaryDatasetOpKernel { OpInputList initial_state_inputs; OP_REQUIRES_OK(ctx, ctx->input_list("initial_state", &initial_state_inputs)); - std::vector<Tensor> initial_state; - initial_state.reserve(initial_state_inputs.size()); - for (const Tensor& t : initial_state_inputs) { - initial_state.push_back(t); - } - - OpInputList inputs; - OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs)); - std::vector<Tensor> other_arguments; - other_arguments.reserve(inputs.size()); - for (const Tensor& t : inputs) { - other_arguments.push_back(t); - } + std::vector<Tensor> initial_state(initial_state_inputs.begin(), + initial_state_inputs.end()); std::unique_ptr<CapturedFunction> captured_func; - OP_REQUIRES_OK(ctx, CapturedFunction::Create( - func_, std::move(other_arguments), &captured_func)); + OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments", + &captured_func)); *output = new Dataset(ctx, input, func_, std::move(initial_state), std::move(captured_func), state_types_, output_types_, diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc index e1cefd23d8..ca4ea25b89 100644 --- a/tensorflow/core/kernels/data/tensor_dataset_op.cc +++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc @@ -33,11 +33,7 @@ class TensorDatasetOp : public DatasetOpKernel { OP_REQUIRES_OK(ctx, ctx->input_list("components", &inputs)); // TODO(mrry): Validate that the shapes of the "components" tensors match // the "shapes" attr.; - std::vector<Tensor> components; - components.reserve(inputs.size()); - for (const Tensor& t : inputs) { - components.push_back(t); - } + std::vector<Tensor> components(inputs.begin(), inputs.end()); *output = new Dataset(ctx, std::move(components)); } diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc index 3975086841..ac44623ce2 100644 --- a/tensorflow/core/kernels/data/window_dataset_op.cc +++ b/tensorflow/core/kernels/data/window_dataset_op.cc @@ -33,22 +33,44 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { int64 window_size = 0; - 
OP_REQUIRES_OK( - ctx, ParseScalarArgument<int64>(ctx, "window_size", &window_size)); + OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "size", &window_size)); OP_REQUIRES( ctx, window_size > 0, errors::InvalidArgument("Window size must be greater than zero.")); - *output = new Dataset(ctx, window_size, input); + int64 window_shift = 0; + OP_REQUIRES_OK(ctx, + ParseScalarArgument<int64>(ctx, "shift", &window_shift)); + OP_REQUIRES( + ctx, window_shift > 0, + errors::InvalidArgument("Window shift must be greater than zero.")); + + int64 window_stride = 0; + OP_REQUIRES_OK(ctx, + ParseScalarArgument<int64>(ctx, "stride", &window_stride)); + OP_REQUIRES( + ctx, window_stride > 0, + errors::InvalidArgument("Window stride must be greater than zero.")); + + bool drop_remainder; + OP_REQUIRES_OK( + ctx, ParseScalarArgument<bool>(ctx, "drop_remainder", &drop_remainder)); + + *output = new Dataset(ctx, input, window_size, window_shift, window_stride, + drop_remainder); } private: class Dataset : public DatasetBase { public: - Dataset(OpKernelContext* ctx, int64 window_size, const DatasetBase* input) + Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 window_size, + int64 window_shift, int64 window_stride, bool drop_remainder) : DatasetBase(DatasetContext(ctx)), + input_(input), window_size_(window_size), - input_(input) { + window_shift_(window_shift), + window_stride_(window_stride), + drop_remainder_(drop_remainder) { input_->Ref(); } @@ -72,7 +94,8 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { } string DebugString() const override { - return strings::StrCat("WindowDatasetOp(", window_size_, ")::Dataset"); + return strings::StrCat("WindowDatasetOp(", window_size_, window_shift_, + window_stride_, drop_remainder_, ")::Dataset"); } protected: @@ -81,10 +104,19 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { Node** output) const override { Node* input_graph_node = nullptr; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); - Node* window_size = nullptr; - TF_RETURN_IF_ERROR(b->AddScalar(window_size_, &window_size)); + Node* window_size_node = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(window_size_, &window_size_node)); + Node* window_shift_node = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(window_shift_, &window_shift_node)); + Node* window_stride_node = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(window_stride_, &window_stride_node)); + Node* drop_remainder_node = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node)); TF_RETURN_IF_ERROR( - b->AddDataset(this, {input_graph_node, window_size}, output)); + b->AddDataset(this, + {input_graph_node, window_size_node, window_shift_node, + window_stride_node, drop_remainder_node}, + output)); return Status::OK(); } @@ -101,37 +133,79 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors, bool* end_of_sequence) override { - // Each row of `window_elements` is a tuple of tensors from the - // input iterator. 
+ const int64 window_size = dataset()->window_size_; + const int64 window_shift = dataset()->window_shift_; + const int64 window_stride = dataset()->window_stride_; std::vector<std::vector<Tensor>> window_elements; + Status status = Status::OK(); { mutex_lock l(mu_); - if (!input_impl_) { + if (!input_impl_ && buffer_.empty()) { *end_of_sequence = true; return Status::OK(); } - window_elements.reserve(dataset()->window_size_); - *end_of_sequence = false; - for (int i = 0; i < dataset()->window_size_ && !*end_of_sequence; - ++i) { - std::vector<Tensor> window_element_tuple; - TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &window_element_tuple, - end_of_sequence)); - if (!*end_of_sequence) { - window_elements.emplace_back(std::move(window_element_tuple)); - } else { - input_impl_.reset(); + + // Add elements to the buffer. + size_t target_size = TargetBufferSize(window_size, window_stride); + if (input_impl_) { + *end_of_sequence = false; + for (size_t i = buffer_.size(); + i < target_size && !*end_of_sequence; ++i) { + std::vector<Tensor> element; + Status status = + input_impl_->GetNext(ctx, &element, end_of_sequence); + if (!*end_of_sequence) { + buffer_.emplace_back(std::move(element), status); + } else { + input_impl_.reset(); + } } } + + // If there are not enough elements and `drop_remainder` is set, we do + // not wish to return a smaller window. + if (buffer_.empty() || + (dataset()->drop_remainder_ && buffer_.size() < target_size)) { + DCHECK(*end_of_sequence); + return Status::OK(); + } + + int num_elements = 1 + (buffer_.size() - 1) / window_stride; + window_elements.reserve(num_elements); + for (size_t i = 0; i < num_elements; ++i) { + status.Update(buffer_[window_stride * i].status); + if (!status.ok()) { + break; + } + window_elements.emplace_back(buffer_[window_stride * i].result); + } + + // Shift the window, discarding elements if necessary. + int buffer_size = buffer_.size(); + if (window_shift >= buffer_size) { + for (size_t i = buffer_size; input_impl_ && i < window_shift; ++i) { + bool end_of_input; + std::vector<Tensor> element; + // Ignore non-error status of discarded elements. + input_impl_->GetNext(ctx, &element, &end_of_input).IgnoreError(); + if (end_of_input) { + input_impl_.reset(); + } + } + buffer_.clear(); + } else { + buffer_.erase(buffer_.begin(), buffer_.begin() + window_shift); + } } - if (window_elements.empty()) { - DCHECK(*end_of_sequence); - return Status::OK(); + if (!status.ok()) { + return status; } + // Construct output tensors. const size_t num_tuple_components = window_elements[0].size(); const int64 num_window_elements = window_elements.size(); + *end_of_sequence = false; for (size_t idx = 0; idx < num_tuple_components; ++idx) { DatasetBase* window_dataset; std::vector<std::vector<Tensor>> window_component_elements; @@ -154,7 +228,6 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { TF_RETURN_IF_ERROR(StoreDatasetInVariantTensor(window_dataset, &out_tensors->back())); } - *end_of_sequence = false; return Status::OK(); } @@ -167,6 +240,20 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { } else { TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); } + // Save buffer. 
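+      // For reference, the checkpoint layout written below: "buffer_size" holds
+      // the number of buffered elements; element i stores its Status under
+      // "buffer[i].code" (and "buffer[i].error_message" for non-OK statuses, see
+      // WriteStatusLocked), its tensor count under "buffer[i].size", and each
+      // tensor under "buffer[i][j]".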
+ TF_RETURN_IF_ERROR(writer->WriteScalar(strings::StrCat("buffer_size"), + buffer_.size())); + for (int64 i = 0; i < buffer_.size(); i++) { + TF_RETURN_IF_ERROR(WriteStatusLocked(writer, i, buffer_[i].status)); + TF_RETURN_IF_ERROR( + writer->WriteScalar(strings::StrCat("buffer[", i, "].size"), + buffer_[i].result.size())); + for (int64 j = 0; j < buffer_[i].result.size(); j++) { + TF_RETURN_IF_ERROR( + writer->WriteTensor(strings::StrCat("buffer[", i, "][", j, "]"), + buffer_[i].result[j])); + } + } return Status::OK(); } @@ -178,22 +265,92 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { } else { input_impl_.reset(); } + // Restore buffer. + int64 buffer_size; + TF_RETURN_IF_ERROR( + reader->ReadScalar(strings::StrCat("buffer_size"), &buffer_size)); + buffer_.resize(buffer_size); + for (int64 i = 0; i < buffer_size; i++) { + int64 vector_size; + TF_RETURN_IF_ERROR(ReadStatusLocked(reader, i, &buffer_[i].status)); + TF_RETURN_IF_ERROR(reader->ReadScalar( + strings::StrCat("buffer[", i, "].size"), &vector_size)); + buffer_[i].result.resize(vector_size); + for (int64 j = 0; j < vector_size; j++) { + TF_RETURN_IF_ERROR( + reader->ReadTensor(strings::StrCat("buffer[", i, "][", j, "]"), + &buffer_[i].result[j])); + } + } return Status::OK(); } private: + struct InvocationResult { + InvocationResult() = default; + InvocationResult(std::vector<Tensor>&& result, const Status& status) + : result(result), status(status) {} + + std::vector<Tensor> result; + Status status; + }; + + Status WriteStatusLocked(IteratorStateWriter* writer, size_t index, + const Status& status) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + TF_RETURN_IF_ERROR(writer->WriteScalar( + CodeKey(index), static_cast<int64>(status.code()))); + if (!status.ok()) { + TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index), + status.error_message())); + } + return Status::OK(); + } + + Status ReadStatusLocked(IteratorStateReader* reader, size_t index, + Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int64 code_int; + TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int)); + error::Code code = static_cast<error::Code>(code_int); + + if (code != error::Code::OK) { + string error_message; + TF_RETURN_IF_ERROR( + reader->ReadScalar(ErrorMessageKey(index), &error_message)); + *status = Status(code, error_message); + } else { + *status = Status::OK(); + } + return Status::OK(); + } + + string CodeKey(size_t index) { + return full_name(strings::StrCat("buffer[", index, "].code")); + } + + string ErrorMessageKey(size_t index) { + return full_name(strings::StrCat("buffer[", index, "].error_message")); + } + + size_t TargetBufferSize(int64 window_size, int64 window_stride) { + return (window_size - 1) * window_stride + 1; + } + mutex mu_; + std::deque<InvocationResult> buffer_ GUARDED_BY(mu_); std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_); }; - const int64 window_size_; const DatasetBase* const input_; + const int64 window_size_; + const int64 window_shift_; + const int64 window_stride_; + const bool drop_remainder_; }; }; REGISTER_KERNEL_BUILDER(Name("WindowDataset").Device(DEVICE_CPU), WindowDatasetOp); - } // namespace } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc index b4dcf0a74b..ae451be7e2 100644 --- a/tensorflow/core/kernels/decode_bmp_op.cc +++ b/tensorflow/core/kernels/decode_bmp_op.cc @@ -91,8 +91,10 @@ class DecodeBmpOp : public OpKernel { errors::InvalidArgument( "Number of channels must be 1, 3 or 4, was ", 
channels_)); - OP_REQUIRES(context, width > 0 && header_size >= 0, + OP_REQUIRES(context, width > 0, errors::InvalidArgument("Width must be positive")); + OP_REQUIRES(context, height != 0, + errors::InvalidArgument("Height must be nonzero")); OP_REQUIRES(context, header_size >= 0, errors::InvalidArgument("header size must be nonnegative")); @@ -108,8 +110,7 @@ class DecodeBmpOp : public OpKernel { const int32 abs_height = abs(height); // there may be padding bytes when the width is not a multiple of 4 bytes - // 8 * channels == bits per pixel - const int row_size = (8 * channels_ * width + 31) / 32 * 4; + const int row_size = (channels_ * width + 3) / 4 * 4; const int64 last_pixel_offset = static_cast<int64>(header_size) + (abs_height - 1) * row_size + diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc index 3eed847c16..6bfb5bd5bc 100644 --- a/tensorflow/core/kernels/decode_csv_op.cc +++ b/tensorflow/core/kernels/decode_csv_op.cc @@ -61,6 +61,9 @@ class DecodeCSVOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input_list("record_defaults", &record_defaults)); for (int i = 0; i < record_defaults.size(); ++i) { + OP_REQUIRES(ctx, record_defaults[i].dims() <= 1, + errors::InvalidArgument( + "Each record default should be at most rank 1")); OP_REQUIRES(ctx, record_defaults[i].NumElements() < 2, errors::InvalidArgument( "There should only be 1 default per field but field ", i, diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 2a25459194..76afd6f18c 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -17,7 +17,7 @@ limitations under the License. #define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "external/cub_archive/cub/util_ptx.cuh" +#include "third_party/cub/util_ptx.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc index 862a97723f..e7882acc80 100644 --- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc @@ -35,10 +35,10 @@ limitations under the License. 
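The simplified row_size expression in the decode_bmp change above is plain 4-byte row alignment; a minimal standalone sketch of the arithmetic, with hypothetical width/channels values:

    #include <cstdio>
    int main() {
      const int width = 5, channels = 3;  // hypothetical 5-pixel RGB row
      // BMP rows are padded to a multiple of 4 bytes, so this is
      // ceil(channels * width / 4) * 4.
      const int row_size = (channels * width + 3) / 4 * 4;
      std::printf("payload=%d padded=%d\n", channels * width, row_size);  // 15 16
      return 0;
    }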
#define EIGEN_USE_GPU -#include "external/cub_archive/cub/device/device_radix_sort.cuh" -#include "external/cub_archive/cub/device/device_reduce.cuh" -#include "external/cub_archive/cub/iterator/constant_input_iterator.cuh" -#include "external/cub_archive/cub/thread/thread_operators.cuh" +#include "third_party/cub/device/device_radix_sort.cuh" +#include "third_party/cub/device/device_reduce.cuh" +#include "third_party/cub/iterator/constant_input_iterator.cuh" +#include "third_party/cub/thread/thread_operators.cuh" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" diff --git a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h index 27918b410b..8edf7d4a2c 100644 --- a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h +++ b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h @@ -59,12 +59,12 @@ EIGEN_ALWAYS_INLINE static const typename internal::conditional< const array< typename internal::traits<OutputBackward>::Index, 5>, const TensorReverseOp<const Eigen::array<bool, 5>, - const Kernel> > > >, + const Kernel>>>>, const TensorReshapingOp< const DSizes<typename internal::traits<OutputBackward>::Index, 2>, const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, - const OutputBackward> > > >, + const OutputBackward>>>>, TensorReshapingOp< const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, @@ -75,7 +75,7 @@ EIGEN_ALWAYS_INLINE static const typename internal::conditional< const DSizes<typename internal::traits<OutputBackward>::Index, 2>, const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, - const OutputBackward> >, + const OutputBackward>>, const Eigen::TensorForcedEvalOp<const TensorReshapingOp< const DSizes<typename internal::traits<OutputBackward>::Index, 2>, @@ -83,7 +83,7 @@ EIGEN_ALWAYS_INLINE static const typename internal::conditional< const array< typename internal::traits<OutputBackward>::Index, 5>, const TensorReverseOp<const Eigen::array<bool, 5>, - const Kernel> > > > > > >::type + const Kernel>>>>>>>::type CuboidConvolutionBackwardInput( const Kernel& kernel, const OutputBackward& output_backward, typename internal::traits<OutputBackward>::Index inputPlanes, @@ -94,12 +94,12 @@ CuboidConvolutionBackwardInput( typedef typename internal::traits<OutputBackward>::Index TensorIndex; const TensorRef<const Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, - internal::traits<Kernel>::Layout, TensorIndex> > + internal::traits<Kernel>::Layout, TensorIndex>> kern(kernel); const TensorRef< const Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, - internal::traits<OutputBackward>::Layout, TensorIndex> > + internal::traits<OutputBackward>::Layout, TensorIndex>> out(output_backward); EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == @@ -239,8 +239,8 @@ CuboidConvolutionBackwardInput( } } - // We will contract along the fused dimension that contains the kernelFilters, - // kernelPlanes, kernelRows and kernelCols. + // We will contract along the collapsed dimension that contains the + // kernelFilters, kernelPlanes, kernelRows and kernelCols. 
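  // For readers new to Eigen contractions: IndexPair(a, b) below reads as
  // "contract dimension a of the left operand with dimension b of the right",
  // so for 2-D operands IndexPair(1, 0) is an ordinary matrix product.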
array<IndexPair<TensorIndex>, 1> contract_dims; if (isColMajor) { // col-major: kernel.contract(output.patches) @@ -323,35 +323,69 @@ CuboidConvolutionBackwardInput( */ template <typename OutputBackward, typename Input> EIGEN_ALWAYS_INLINE static const typename internal::conditional< - internal::traits<OutputBackward>::Layout == ColMajor, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 5>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 2>, - const OutputBackward>, - const TensorShufflingOp< - const array<typename internal::traits<OutputBackward>::Index, - 2>, - const TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, - const Input> > > > >, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 5>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorShufflingOp< - const array<typename internal::traits<OutputBackward>::Index, - 2>, - const TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, - const Input> > >, - const TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 2>, - const OutputBackward> > > >::type + internal::traits<Input>::Layout == ColMajor, + const TensorReverseOp< + const Eigen::array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const Eigen::TensorShufflingOp< + const Eigen::array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const Eigen::TensorReshapingOp< + const Eigen::DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair<typename internal::traits<Input>::Index>, 1>, + const Eigen::TensorForcedEvalOp<const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + 2>, + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const OutputBackward>>>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + 2>, + const TensorVolumePatchOp< + Dynamic, Dynamic, Dynamic, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const Input>>>>>>>>, + const TensorReverseOp< + const Eigen::array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const Eigen::TensorShufflingOp< + const Eigen::array<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const Eigen::TensorReshapingOp< + const Eigen::DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + 2>, + const TensorVolumePatchOp< + Dynamic, Dynamic, Dynamic, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const Input>>>>, + const Eigen::TensorForcedEvalOp<const 
TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + 2>, + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const OutputBackward>>>>>>>>::type CuboidConvolutionBackwardKernel( const Input& input, const OutputBackward& output_backward, typename internal::traits<Input>::Index kernelPlanes, @@ -362,11 +396,11 @@ CuboidConvolutionBackwardKernel( typedef typename internal::traits<Input>::Index TensorIndex; TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, - internal::traits<Input>::Layout, TensorIndex> > + internal::traits<Input>::Layout, TensorIndex>> in(input); TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, - internal::traits<OutputBackward>::Layout, TensorIndex> > + internal::traits<OutputBackward>::Layout, TensorIndex>> out(output_backward); EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == @@ -380,6 +414,13 @@ CuboidConvolutionBackwardKernel( internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + // We do not support higher dimensional backward convolutions, or convolutions + // without batch dimension. + // TODO(ezhulenev): Relax this constraint, and turn on tests without batch + // dimension in eigen_backward_cuboid_convolutions_test.cc. + EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, + YOU_MADE_A_PROGRAMMING_MISTAKE); + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); const TensorIndex inputRows = @@ -401,6 +442,10 @@ CuboidConvolutionBackwardKernel( const TensorIndex kernelChannels = isColMajor ? in.dimension(0) : in.dimension(NumDims - 1); + // Number of batches in the input tensor. + const TensorIndex batch = + isColMajor ? in.dimension(4) : in.dimension(NumDims - 5); + // TODO(ezhulenev): Add support for inflated strides. Without inflated strides // effective kernel planes/rows/cols are always the same as the kernel itself // (see eigen_spatial_convolutions for details). @@ -408,6 +453,7 @@ CuboidConvolutionBackwardKernel( const TensorIndex kernelRowsEff = kernelRows; const TensorIndex kernelColsEff = kernelCols; + // Compute forward padding from input and output_backward dimensions. const TensorIndex padPlanes = numext::maxi<Index>( 0, (outputPlanes - 1) * stridePlanes + kernelPlanesEff - inputPlanes); const TensorIndex padRows = numext::maxi<Index>( @@ -416,92 +462,147 @@ CuboidConvolutionBackwardKernel( 0, (outputCols - 1) * strideCols + kernelColsEff - inputCols); const TensorIndex padding_top_z = padPlanes / 2; - const TensorIndex padding_bottom_z = padPlanes - padding_top_z; const TensorIndex padding_top = padRows / 2; - const TensorIndex padding_bottom = padRows - padding_top; const TensorIndex padding_left = padCols / 2; - const TensorIndex padding_right = padCols - padding_left; - // Reshaped output_backward before contraction. - DSizes<TensorIndex, 2> output_dims; + // Compute paddings for output_backward before extracting patches. 
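+  // A worked example of the padding arithmetic below, with hypothetical sizes
+  // inputRows = 5, kernelRows = 3, strideRows = 2 (VALID, so outputRows = 2
+  // and padRows = 0):
+  //   expanded_out_rows = (2 - 1) * 2 + 1 = 3
+  //   padded_out_rows   = 5 + 3 - 1       = 7
+  //   top_pad_rows      = 3 - 1 - 0       = 2
+  //   bottom_pad_rows   = 7 - 3 - 2       = 2
+  // Extracting length-5 row patches with stride 1 from the padded length-7
+  // gradient then yields 7 - 5 + 1 = 3 = kernelRows positions.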
+ const auto expanded_out_planes = (outputPlanes - 1) * stridePlanes + 1; + const auto expanded_out_rows = (outputRows - 1) * strideRows + 1; + const auto expanded_out_cols = (outputCols - 1) * strideCols + 1; + const auto padded_out_planes = inputPlanes + kernelPlanes - 1; + const auto padded_out_rows = inputRows + kernelRows - 1; + const auto padded_out_cols = inputCols + kernelCols - 1; + const auto top_pad_planes = kernelPlanes - 1 - padding_top_z; + const auto top_pad_rows = kernelRows - 1 - padding_top; + const auto left_pad_cols = kernelCols - 1 - padding_left; + const auto bottom_pad_planes = + padded_out_planes - expanded_out_planes - top_pad_planes; + const auto bottom_pad_rows = + padded_out_rows - expanded_out_rows - top_pad_rows; + const auto right_pad_cols = + padded_out_cols - expanded_out_cols - left_pad_cols; + + // Reorder output_backward dimensions. + array<TensorIndex, 5> output_backward_shuffle; if (isColMajor) { - output_dims[0] = kernelFilters; - output_dims[1] = outputPlanes * outputRows * outputCols; - for (int i = 4; i < NumDims; ++i) { - output_dims[1] *= out.dimension(i); - } + // From: [out_depth, out_planes, out_rows, out_cols, batch] + // To: [batch, out_planes, out_rows, out_cols, out_depth] + output_backward_shuffle = {4, 1, 2, 3, 0}; } else { - output_dims[1] = kernelFilters; - output_dims[0] = outputCols * outputRows * outputPlanes; - for (int i = 0; i < NumDims - 4; ++i) { - output_dims[0] *= out.dimension(i); - } + // From: [batch, out_cols, out_rows, out_planes, out_depth] + // To: [out_depth, out_cols, out_rows, out_planes, batch] + output_backward_shuffle = {4, 1, 2, 3, 0}; } - // Reshaped extract_volume_patches(in) - DSizes<TensorIndex, 2> pre_contract_dims; + // Reorder input dimensions. + array<TensorIndex, 5> input_shuffle; if (isColMajor) { - pre_contract_dims[0] = - kernelChannels * kernelPlanes * kernelRows * kernelCols; - pre_contract_dims[1] = outputPlanes * outputRows * outputCols; - for (int i = 4; i < NumDims; ++i) { - pre_contract_dims[1] *= in.dimension(i); - } - eigen_assert(output_dims[1] == pre_contract_dims[1]); + // From: [in_depth, in_planes, in_rows, in_cols, batch] + // To: [in_depth, batch, in_planes, in_rows, in_cols] + input_shuffle = {0, 4, 1, 2, 3}; } else { - pre_contract_dims[1] = - kernelCols * kernelRows * kernelPlanes * kernelChannels; - pre_contract_dims[0] = outputCols * outputRows * outputPlanes; - for (int i = 0; i < NumDims - 4; ++i) { - pre_contract_dims[0] *= in.dimension(i); - } - eigen_assert(output_dims[0] == pre_contract_dims[0]); + // From: [batch, in_cols, in_rows, in_planes, in_depth] + // To: [in_cols, in_rows, in_planes, batch, in_depth] + input_shuffle = {1, 2, 3, 0, 4}; } - array<TensorIndex, 2> shuffle_dims; - shuffle_dims[0] = 1; - shuffle_dims[1] = 0; + // Input is playing the role of a "kernel" in this convolution. 
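+  // A hedged reading of the construction below: the filter gradient of a
+  // convolution is itself a convolution in which the original input slides
+  // over the (dilated, padded) output gradient, i.e. for unit strides
+  // dL/dW[p, r, c] sums input[p + dp, r + dr, c + dc] * output_grad[dp, dr, dc]
+  // over batch and output positions, so the input tensor takes the place of
+  // the kernel here.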
+ DSizes<TensorIndex, 2> input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = batch * inputPlanes * inputRows * inputCols; + } else { + input_dims[1] = kernelChannels; + input_dims[0] = inputCols * inputRows * inputPlanes * batch; + } + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the + // kernels + // - the second dimension (dims[1]): everything else + DSizes<TensorIndex, 2> pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = batch * inputPlanes * inputRows * inputCols; + pre_contract_dims[1] = + kernelPlanes * kernelRows * kernelCols * kernelFilters; + } else { + pre_contract_dims[1] = inputCols * inputRows * inputPlanes * batch; + pre_contract_dims[0] = + kernelFilters * kernelCols * kernelRows * kernelPlanes; + } + + // We will contract along the collapsed dimension that contains the + // batch, inputPlanes, inputRows and inputCols. array<IndexPair<TensorIndex>, 1> contract_dims; contract_dims[0] = IndexPair<TensorIndex>(1, 0); - DSizes<TensorIndex, 5> kernel_dims; + // Dimensions after contraction. + DSizes<TensorIndex, NumDims> post_contract_dims; if (isColMajor) { - kernel_dims[0] = kernelFilters; - kernel_dims[1] = kernelChannels; - kernel_dims[2] = kernelPlanes; - kernel_dims[3] = kernelRows; - kernel_dims[4] = kernelCols; + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = kernelPlanes; + post_contract_dims[2] = kernelRows; + post_contract_dims[3] = kernelCols; + post_contract_dims[4] = kernelFilters; } else { - kernel_dims[4] = kernelFilters; - kernel_dims[3] = kernelChannels; - kernel_dims[2] = kernelPlanes; - kernel_dims[1] = kernelRows; - kernel_dims[0] = kernelCols; + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = kernelCols; + post_contract_dims[2] = kernelRows; + post_contract_dims[3] = kernelPlanes; + post_contract_dims[4] = kernelChannels; } - return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - output_backward.reshape(output_dims) - .contract(input + // Reorder output of contraction to valid filter shape. + array<TensorIndex, 5> kernel_shuffle; + if (isColMajor) { + // From: [in_depth, kernel_planes, kernel_rows, kernel_cols, out_depth] + // To: [out_depth, in_depth, kernel_planes, kernel_rows, kernel_cols] + kernel_shuffle = {4, 0, 1, 2, 3}; + } else { + // From: [out_depth, kernel_cols, kernel_rows, kernel_planes, in_depth] + // To: [kernel_cols, kernel_rows, kernel_planes, in_depth, out_depth] + kernel_shuffle = {1, 2, 3, 4, 0}; + } + + // Reverse kernel backprop dimensions. + array<TensorIndex, 5> kernel_reverse; + if (isColMajor) { + kernel_reverse = {false, false, true, true, true}; + } else { + kernel_reverse = {true, true, true, false, false}; + } + + // Create convolution input (aka source of patches) from output backward + // tensor by shuffling dimensions. + const auto the_input = + output_backward.shuffle(output_backward_shuffle).eval(); + + // Create convolution kernel (aka filter) from input by shuffling and + // reshaping. 
+ const auto the_kernel = + input.shuffle(input_shuffle).reshape(input_dims).eval(); + + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + the_kernel.contract( + the_input .extract_volume_patches( - kernelPlanes, kernelRows, kernelCols, stridePlanes, - strideRows, strideCols, 1, 1, 1, padding_top_z, - padding_bottom_z, padding_top, padding_bottom, - padding_left, padding_right) - .reshape(pre_contract_dims) - .shuffle(shuffle_dims), - contract_dims) - .reshape(kernel_dims), - input - .extract_volume_patches(kernelPlanes, kernelRows, kernelCols, - stridePlanes, strideRows, strideCols, 1, 1, 1, - padding_top_z, padding_bottom_z, padding_top, - padding_bottom, padding_left, padding_right) - .reshape(pre_contract_dims) - .shuffle(shuffle_dims) - .contract(output_backward.reshape(output_dims), contract_dims) - .reshape(kernel_dims)); + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, + top_pad_planes, bottom_pad_planes, top_pad_rows, + bottom_pad_rows, left_pad_cols, right_pad_cols) + .reshape(pre_contract_dims), + contract_dims), + the_input + .extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, top_pad_planes, + bottom_pad_planes, top_pad_rows, bottom_pad_rows, + left_pad_cols, right_pad_cols) + .reshape(pre_contract_dims) + .contract(the_kernel, contract_dims)) + .reshape(post_contract_dims) + .shuffle(kernel_shuffle) + .reverse(kernel_reverse); } } // end namespace Eigen diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h index 8d06107553..960920c55b 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h @@ -238,8 +238,8 @@ SpatialConvolutionBackwardInput( } } - // We will contract along the fused dimension that contains the kernelFilters, - // the kernelRows and the kernelCols. + // We will contract along the collapsed dimension that contains the + // kernelFilters, the kernelRows and the kernelCols. 
array<IndexPair<TensorIndex>, 1> contract_dims; if (isColMajor) { // col-major: kernel.contract(output.patches) @@ -332,23 +332,16 @@ EIGEN_ALWAYS_INLINE static const typename internal::conditional< const TensorReshapingOp< const DSizes<typename internal::traits<Input>::Index, 2>, const OutputBackward>, - const TensorShufflingOp< - const array<typename internal::traits<OutputBackward>::Index, - 2>, - const TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, - const Input> > > > >, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > >, TensorReshapingOp< const DSizes<typename internal::traits<Input>::Index, 4>, const TensorContractionOp< const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorShufflingOp< - const array<typename internal::traits<OutputBackward>::Index, - 2>, - const TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >, + const TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input> >, const TensorReshapingOp< const DSizes<typename internal::traits<Input>::Index, 2>, const OutputBackward> > > >::type @@ -456,12 +449,16 @@ SpatialConvolutionBackwardKernel( eigen_assert(output_dims[0] == pre_contract_dims[0]); } - array<TensorIndex, 2> shuffle_dims; - shuffle_dims[0] = 1; - shuffle_dims[1] = 0; - + // We will contract along the collapsed dimension that contains the + // outputCols, outputRows and OTHERS. array<IndexPair<TensorIndex>, 1> contract_dims; - contract_dims[0] = IndexPair<TensorIndex>(1, 0); + if (isColMajor) { + // col-major: output_backward.contract(input.patches) + contract_dims[0] = IndexPair<TensorIndex>(1, 1); + } else { + // row-major: input.patches.contract(output_backward) + contract_dims[0] = IndexPair<TensorIndex>(0, 0); + } // After the contraction, the kernel will have the desired shape // out_depth X in_shape X kernel_rows X kernel_cols @@ -487,8 +484,7 @@ SpatialConvolutionBackwardKernel( kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride, 1, 1, padding_top, padding_bottom, padding_left, padding_right, OutScalar(0)) - .reshape(pre_contract_dims) - .shuffle(shuffle_dims), + .reshape(pre_contract_dims), contract_dims) .reshape(kernel_dims), input @@ -497,7 +493,6 @@ SpatialConvolutionBackwardKernel( padding_top, padding_bottom, padding_left, padding_right, OutScalar(0)) .reshape(pre_contract_dims) - .shuffle(shuffle_dims) .contract(output_backward.reshape(output_dims), contract_dims) .reshape(kernel_dims)); } diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc index 2229ec9659..673ec1458b 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc @@ -1248,11 +1248,14 @@ TEST(EigenBackwardSpatialConvolutionsTest, const int output_cols = input_cols - patch_cols + 1; const int output_planes = input_planes - patch_planes + 1; - Tensor<float, 4> input(input_depth, input_planes, input_rows, input_cols); + // TODO(ezhulenev): Support backward kernel convolution without batch + // dimension. 
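+  // The new CuboidConvolutionBackwardKernel statically asserts a 5-D input
+  // (see the EIGEN_STATIC_ASSERT added above), so these tests append a batch
+  // dimension of size 1 rather than using 4-D tensors.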
+ Tensor<float, 5> input(input_depth, input_planes, input_rows, input_cols, + /*num_batches*/ 1); Tensor<float, 5> kernel(output_depth, input_depth, patch_planes, patch_rows, patch_cols); - Tensor<float, 4> output_backward(output_depth, output_planes, output_rows, - output_cols); + Tensor<float, 5> output_backward(output_depth, output_planes, output_rows, + output_cols, /*num_batches*/ 1); output_backward = output_backward.constant(11.0f) + output_backward.random(); input = input.constant(2.0f) + input.random(); @@ -1282,9 +1285,9 @@ TEST(EigenBackwardSpatialConvolutionsTest, if (output_i >= 0 && output_i < output_planes && output_j >= 0 && output_j < output_rows && output_k >= 0 && output_k < output_cols) { - expected += - input(id, i, j, k) * - output_backward(od, output_i, output_j, output_k); + expected += input(id, i, j, k, /*batch*/ 0) * + output_backward(od, output_i, output_j, + output_k, /*batch*/ 0); } } } @@ -1311,12 +1314,14 @@ TEST(EigenBackwardSpatialConvolutionsTest, const int output_cols = input_cols - patch_cols + 1; const int output_planes = input_planes - patch_planes + 1; - Tensor<float, 4, RowMajor> input(input_cols, input_rows, input_planes, - input_depth); + // TODO(ezhulenev): Support backward kernel convolution without batch + // dimension. + Tensor<float, 5, RowMajor> input(/*num_batches*/ 1, input_cols, input_rows, + input_planes, input_depth); Tensor<float, 5, RowMajor> kernel(patch_cols, patch_rows, patch_planes, input_depth, output_depth); - Tensor<float, 4, RowMajor> output_backward(output_cols, output_rows, - output_planes, output_depth); + Tensor<float, 5, RowMajor> output_backward( + /*num_batches*/ 1, output_cols, output_rows, output_planes, output_depth); output_backward = output_backward.constant(11.0f) + output_backward.random(); input = input.constant(2.0f) + input.random(); @@ -1346,9 +1351,9 @@ TEST(EigenBackwardSpatialConvolutionsTest, if (output_i >= 0 && output_i < output_planes && output_j >= 0 && output_j < output_rows && output_k >= 0 && output_k < output_cols) { - expected += - input(k, j, i, id) * - output_backward(output_k, output_j, output_i, od); + expected += input(/*batch*/ 0, k, j, i, id) * + output_backward(/*batch*/ 0, output_k, output_j, + output_i, od); } } } diff --git a/tensorflow/core/kernels/eigen_cuboid_convolution.h b/tensorflow/core/kernels/eigen_cuboid_convolution.h index 62e9f9123d..6a9a2accd8 100644 --- a/tensorflow/core/kernels/eigen_cuboid_convolution.h +++ b/tensorflow/core/kernels/eigen_cuboid_convolution.h @@ -21,6 +21,1412 @@ limitations under the License. namespace Eigen { +namespace internal { + +// WARNING: Most of the code here implicitly assumes that the matrix is in +// ColMajor layout. This is guaranteed by the tensor contraction (see +// TensorContraction.h). +// +// Inside Eigen a tensor contraction is represented by a matrix multiplication. +// We don't want to actually extract volume patches and reshape the result into +// a matrix (this involves allocating huge extra memory), so the patch +// extraction and reshape operations are implicit. +// +// TensorContractionInputMapper takes a matrix index and returns the coefficient +// (or the packet) of the "virtual tensor", that would be at that index if we +// were to actually reshape the result of patch extraction. +// +// TensorContractionSubMapper provides a similar view into the "virtual matrix" +// at the given vertical and horizontal offsets. 
+// +// "Virtual matrix" dimensions: +// *0: kernelChannels * kernelPlanes * kernelRows * kernelCols +// 1: out_planes * out_height * out_width * OTHERS (e.g batches, etc...) +// +// *) extracted patches are continuous in memory (innermost dimension assuming +// col major layout) +// +// With this dimensions: +// row - offset within a single patch (in code: patchId) +// col - index of the extracted patch (in code: patchIndex) +// patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions) +// +template <typename NewDimension, DenseIndex Planes, DenseIndex Rows, + DenseIndex Cols, typename ArgType, typename Device, typename Scalar_, + typename Index, typename nocontract_t, typename contract_t, int Side, + int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, + int Alignment> +class TensorContractionInputMapper< + Scalar_, Index, Side, + TensorEvaluator<const TensorReshapingOp<NewDimension, + const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> { + public: + typedef Scalar_ Scalar; + typedef TensorContractionInputMapper< + Scalar, Index, Side, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + Self; + typedef TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + SubMapper; + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + typedef typename packet_traits<Scalar>::type Packet; + + EIGEN_DEVICE_FUNC + TensorContractionInputMapper( + const TensorEvaluator< + const TensorReshapingOp< + NewDimension, + const TensorVolumePatchOp<Planes, Rows, Cols, ArgType> >, + Device>& tensor, + const nocontract_t&, const nocontract_t&, const contract_t&, + const contract_t&) + : m_impl(tensor.impl().impl()) { + if (internal::traits<ArgType>::Layout == ColMajor) { + m_patch_depth = tensor.impl().dimensions()[0]; + m_patch_planes = tensor.impl().dimensions()[1]; + m_patch_rows = tensor.impl().dimensions()[2]; + m_patch_cols = tensor.impl().dimensions()[3]; + m_num_patches = tensor.impl().dimensions()[4]; + } else { + const int NumDims = tensor.impl().dimensions().size(); + m_patch_depth = tensor.impl().dimensions()[NumDims - 1]; + m_patch_planes = tensor.impl().dimensions()[NumDims - 2]; + m_patch_rows = tensor.impl().dimensions()[NumDims - 3]; + m_patch_cols = tensor.impl().dimensions()[NumDims - 4]; + m_num_patches = tensor.impl().dimensions()[NumDims - 5]; + } + + // Strides for navigating through the single patch. + m_patch_plane_stride = m_patch_depth; + m_patch_row_stride = m_patch_planes * m_patch_plane_stride; + m_patch_col_stride = m_patch_rows * m_patch_row_stride; + + // Strides for the output tensor. + // IMPORTANT: These strides are used to locate an element in a patch at a + // depth zero (channel), which is not quite the same as "traditional" + // stride. 
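+    // A worked example with hypothetical patch dims depth = 2, planes = 3,
+    // rows = 4, cols = 5: m_rowStride = 3, m_colStride = 12, and a "virtual
+    // matrix" row index patchId decomposes (depth innermost) as
+    //   depth       = patchId % 2
+    //   patchOffset = patchId / 2
+    //   colOffset   = patchOffset / 12
+    //   rowOffset   = (patchOffset % 12) / 3
+    //   planeOffset = patchOffset % 3
+    // which is what loadCoeff() below recovers using the fast divisors.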
+ m_rowStride = m_patch_planes; + m_colStride = m_patch_rows * m_rowStride; + m_patchStride = m_colStride * m_patch_cols * m_patch_depth; + m_otherStride = m_patchStride * m_num_patches; + + m_outputPlanes = tensor.impl().outputPlanes(); + m_outputRows = tensor.impl().outputRows(); + m_outputCols = tensor.impl().outputCols(); + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + m_plane_strides = tensor.impl().userPlaneStride(); + m_row_strides = tensor.impl().userRowStride(); + m_col_strides = tensor.impl().userColStride(); + + m_in_plane_strides = tensor.impl().userInPlaneStride(); + m_in_row_strides = tensor.impl().userInRowStride(); + m_in_col_strides = tensor.impl().userInColStride(); + + m_patch_plane_inflate_strides = tensor.impl().planeInflateStride(); + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); + m_patch_col_inflate_strides = tensor.impl().colInflateStride(); + + if (internal::traits<ArgType>::Layout == ColMajor) { + m_inputDepth = tensor.impl().impl().dimensions()[0]; + m_inputPlanes = tensor.impl().impl().dimensions()[1]; + m_inputRows = tensor.impl().impl().dimensions()[2]; + m_inputCols = tensor.impl().impl().dimensions()[3]; + } else { + const int NumDims = tensor.impl().impl().dimensions().size(); + m_inputDepth = tensor.impl().impl().dimensions()[NumDims - 1]; + m_inputPlanes = tensor.impl().impl().dimensions()[NumDims - 2]; + m_inputRows = tensor.impl().impl().dimensions()[NumDims - 3]; + m_inputCols = tensor.impl().impl().dimensions()[NumDims - 4]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_patchInputStride = + m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_planePaddingTop = tensor.impl().planePaddingTop(); + m_rowPaddingTop = tensor.impl().rowPaddingTop(); + m_colPaddingLeft = tensor.impl().colPaddingLeft(); + + m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches); + + m_fastPatchPlaneStride = + internal::TensorIntDivisor<Index>(m_patch_plane_stride); + m_fastPatchRowStride = + internal::TensorIntDivisor<Index>(m_patch_row_stride); + m_fastPatchColStride = + internal::TensorIntDivisor<Index>(m_patch_col_stride); + + m_fastInputPlaneStride = + internal::TensorIntDivisor<Index>(m_patch_plane_inflate_strides); + m_fastInputRowStride = + internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides); + m_fastInputColStride = + internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides); + + m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + + m_fastDimZero = internal::TensorIntDivisor<Index>(m_patch_depth); + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes); + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + m_fastOutputCols = internal::TensorIntDivisor<Index>(m_outputCols); + + m_fastOutputPlanesRows = + internal::TensorIntDivisor<Index>(m_outputPlanesRows); + } + + EIGEN_DEVICE_FUNC + TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper) + : m_impl(base_mapper.m_impl) { + m_patch_depth = base_mapper.m_patch_depth; + m_patch_planes = base_mapper.m_patch_planes; + m_patch_rows = base_mapper.m_patch_rows; + m_patch_cols = base_mapper.m_patch_cols; + m_num_patches = base_mapper.m_num_patches; + + m_patch_plane_stride = 
base_mapper.m_patch_plane_stride; + m_patch_row_stride = base_mapper.m_patch_row_stride; + m_patch_col_stride = base_mapper.m_patch_col_stride; + + m_rowStride = base_mapper.m_rowStride; + m_colStride = base_mapper.m_colStride; + m_patchStride = base_mapper.m_patchStride; + m_otherStride = base_mapper.m_otherStride; + + m_planeInputStride = base_mapper.m_planeInputStride; + m_rowInputStride = base_mapper.m_rowInputStride; + m_colInputStride = base_mapper.m_colInputStride; + m_patchInputStride = base_mapper.m_patchInputStride; + m_otherInputStride = base_mapper.m_otherInputStride; + + m_inputDepth = base_mapper.m_inputDepth; + m_inputPlanes = base_mapper.m_inputPlanes; + m_inputRows = base_mapper.m_inputRows; + m_inputCols = base_mapper.m_inputCols; + + m_outputPlanes = base_mapper.m_outputPlanes; + m_outputRows = base_mapper.m_outputRows; + m_outputCols = base_mapper.m_outputCols; + + m_plane_strides = base_mapper.m_plane_strides; + m_row_strides = base_mapper.m_row_strides; + m_col_strides = base_mapper.m_col_strides; + + m_in_plane_strides = base_mapper.m_in_plane_strides; + m_in_row_strides = base_mapper.m_in_row_strides; + m_in_col_strides = base_mapper.m_in_col_strides; + + m_patch_plane_inflate_strides = base_mapper.m_patch_plane_inflate_strides; + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; + m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; + + m_planePaddingTop = base_mapper.m_planePaddingTop; + m_rowPaddingTop = base_mapper.m_rowPaddingTop; + m_colPaddingLeft = base_mapper.m_colPaddingLeft; + + m_outputPlanesRows = base_mapper.m_outputPlanesRows; + + m_fastNumPatches = base_mapper.m_fastNumPatches; + m_fastPatchPlaneStride = base_mapper.m_fastPatchPlaneStride; + m_fastPatchRowStride = base_mapper.m_fastPatchRowStride; + m_fastPatchColStride = base_mapper.m_fastPatchColStride; + m_fastInputPlaneStride = base_mapper.m_fastInputPlaneStride; + m_fastInputRowStride = base_mapper.m_fastInputRowStride; + m_fastInputColStride = base_mapper.m_fastInputColStride; + m_fastRowStride = base_mapper.m_fastRowStride; + m_fastColStride = base_mapper.m_fastColStride; + m_fastOutputPlanes = base_mapper.m_fastOutputPlanes; + m_fastOutputRows = base_mapper.m_fastOutputRows; + m_fastOutputCols = base_mapper.m_fastOutputCols; + m_fastDimZero = base_mapper.m_fastDimZero; + m_fastOutputPlanesRows = base_mapper.m_fastOutputPlanesRows; + } + + // If true, turns off some optimizations for loading packets since the image + // patches are "non-standard" such as there are non-trivial strides or + // inflations in the input. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_in_plane_strides != 1 || m_in_row_strides != 1 || + m_in_col_strides != 1 || m_patch_plane_inflate_strides != 1 || + m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(0, planeIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + // Load the coefficient at the patchIndex location instead of the usual + // m_rowIndex, m_colIndex, m_otherIndex. 
This is currently only used by the + // gpu code. + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, planeIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(0, planeIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + // Load the packet at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, planeIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const { + return m_impl; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_planeInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchPlanes() const { return m_rowStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, + const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + + private: + friend class TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment>; + + // Load coefficient from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset * m_in_col_strides; + const Index origInputCol = + (m_patch_col_inflate_strides == 1) + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + + const Index rowOffset = + (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex + rowOffset * m_in_row_strides; + const Index origInputRow = + (m_patch_row_inflate_strides == 1) + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + + const Index planeOffset = + patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex + planeOffset * m_in_plane_strides; + const Index origInputPlane = + (m_patch_plane_inflate_strides == 1) + ? inputPlane + : ((inputPlane >= 0) ? 
(inputPlane / m_fastInputPlaneStride) : 0); + + if (origInputCol < 0 || origInputRow < 0 || origInputPlane < 0 || + origInputCol >= m_inputCols || origInputRow >= m_inputRows || + origInputPlane >= m_inputPlanes || + (inputCol != origInputCol * m_patch_col_inflate_strides) || + (inputRow != origInputRow * m_patch_row_inflate_strides) || + (inputPlane != origInputPlane * m_patch_plane_inflate_strides)) { + return Scalar(0); + } + + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + origInputPlane * m_planeInputStride + + origInputRow * m_rowInputStride + + origInputCol * m_colInputStride + otherIndex; + + return m_impl.coeff(inputIndex); + } + + // This is the same as loadCoeff(...), but optimized for all `inflate_strides` + // and `in_strides` equal to 1 (template specialization without templates). + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + eigen_assert(!nonStandardPatches()); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index planeOffset = + patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + + const Index inputCol = colIndex + colOffset; + const Index inputRow = rowIndex + rowOffset; + const Index inputPlane = planeIndex + planeOffset; + + if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || + inputRow >= m_inputRows || inputPlane < 0 || + inputPlane >= m_inputPlanes) { + return Scalar(0); + } + + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputPlane * m_planeInputStride + + inputRow * m_rowInputStride + + inputCol * m_colInputStride + otherIndex; + + return m_impl.coeff(inputIndex); + } + + // Load packet from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + const Index packetSize = internal::unpacket_traits<Packet>::size; + + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < + patchDepth() * patchPlanes() * patchRows() * patchCols()); + + if (nonStandardPatches()) { + return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } + return loadPacketStandard(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + const Index packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < + patchDepth() * patchPlanes() * patchRows() * patchCols()); + eigen_assert(!nonStandardPatches()); + + if ((patchDepth() % packetSize) == 0) { + return loadPacketFast(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } else { + // Offsets and input calculation here are identical to + // loadCoeffStandard(...), but repeated twice. 
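+      // The packet spans patchId .. patchId + packetSize - 1, so both ends are
+      // mapped back to patch offsets below; only when both ends fall in the
+      // same input column and row (and in valid planes) can a single
+      // contiguous packet load be used, otherwise packetWithPossibleZero()
+      // falls back to scalar loads.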
+ + const Index patchOffsets[2] = { + patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; + + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + eigen_assert(colOffsets[0] <= colOffsets[1]); + + const Index inputCols[2] = {colIndex + colOffsets[0], + colIndex + colOffsets[1]}; + if (inputCols[0] >= m_inputCols || inputCols[1] < 0) { + return internal::pset1<Packet>(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowOffsets[2] = { + (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0], + rowIndex + rowOffsets[1]}; + + if (inputRows[0] >= m_inputRows || inputRows[1] < 0) { + return internal::pset1<Packet>(Scalar(0)); + } + + if (inputRows[0] == inputRows[1]) { + const Index planeOffsets[2] = { + patchOffsets[0] - colOffsets[0] * m_colStride - + rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - + rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = {planeIndex + planeOffsets[0], + planeIndex + planeOffsets[1]}; + + if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) { + return internal::pset1<Packet>(Scalar(0)); + } + + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + const Index depth = patchId - patchOffsets[0] * patchDepth(); + const Index inputIndex = + depth + inputPlanes[0] * m_planeInputStride + + inputRows[0] * m_rowInputStride + + inputCols[0] * m_colInputStride + otherIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + } + } + } + + return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + const Index packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < + patchDepth() * patchPlanes() * patchRows() * patchCols()); + + eigen_assert(!nonStandardPatches()); + eigen_assert((patchDepth() % packetSize) == 0); + + // Find the offset of the element wrt the location of the first element. 
+ const Index patchOffset = patchId / m_fastDimZero; + eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); + + const Index colOffset = patchOffset / m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index planeOffset = + patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + + const Index inputCol = colIndex + colOffset; + const Index inputRow = rowIndex + rowOffset; + const Index inputPlane = planeIndex + planeOffset; + + if (inputCol < 0 || inputRow < 0 || inputPlane < 0 || + inputCol >= m_inputCols || inputRow >= m_inputRows || + inputPlane >= m_inputPlanes) { + return internal::pset1<Packet>(Scalar(0)); + } + + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputPlane * m_planeInputStride + + inputRow * m_rowInputStride + + inputCol * m_colInputStride + otherIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet + packetWithPossibleZero(Index patchId, Index planeIndex, Index rowIndex, + Index colIndex, Index otherIndex) const { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX + typename internal::remove_const<Scalar>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = + loadCoeff(patchId + i, planeIndex, rowIndex, colIndex, otherIndex); + } + Packet rslt = internal::pload<Packet>(values); + return rslt; + } + + // Precompute the indices (plane, row, col, other) of the first element of + // the given patch index, within the output tensor of the TensorVolumePatchOp. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices( + Index patchIndex, Index& planeIndex, Index& rowIndex, Index& colIndex, + Index& otherIndex) const { + const size_t NumInputDims = array_size< + typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + + // Check if patchIndex might contain batch and other dimensions. + otherIndex = (NumInputDims == 4) ? 0 : patchIndex / m_fastNumPatches; + + // Compute index of the patch within the batch (and other dimensions). + const Index patch3DIndex = (NumInputDims == 4) + ? patchIndex + : (patchIndex - otherIndex * m_num_patches); + + otherIndex *= m_patchInputStride; + + colIndex = patch3DIndex / m_fastOutputPlanesRows; + rowIndex = + (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + planeIndex = + patch3DIndex - (colIndex * m_outputRows + rowIndex) * m_outputPlanes; + + colIndex = colIndex * m_col_strides - m_colPaddingLeft; + rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; + planeIndex = planeIndex * m_plane_strides - m_planePaddingTop; + } + + Index m_patch_depth; // number of channels in the patch + Index m_patch_planes; // number of planes in the patch + Index m_patch_rows; // number of rows in the patch + Index m_patch_cols; // number of columns in the patch + Index m_num_patches; // number of patches to extract + + // Strides for navigating through the single patch. + Index m_patch_plane_stride; + Index m_patch_row_stride; + Index m_patch_col_stride; + + // Strides for the output tensor (depth is not the part of the stride). 
+ Index m_rowStride; + Index m_colStride; + Index m_patchStride; + Index m_otherStride; + + Index m_planeInputStride; // Plane stride in the input tensor + Index m_rowInputStride; // Row stride in the input tensor + Index m_colInputStride; // Col stride in the input tensor + Index m_patchInputStride; // Patch stride in the input tensor + Index m_otherInputStride; + + Index m_inputDepth; // Depth of the input tensor + Index m_inputPlanes; // Number of planes in the input tensor + Index m_inputRows; // Number of rows in the input tensor + Index m_inputCols; // Number of cols in the input tensor + + Index m_outputPlanes; // Number of output planes + Index m_outputRows; // Number of output rows + Index m_outputCols; // Number of output cols + Index m_outputPlanesRows; // Cached outputPlanes * outputRows. + + Index m_plane_strides; // User specified plane stride + Index m_row_strides; // User specified row stride + Index m_col_strides; // User specified col stride + + // User specified plane/row/col atrous convolution strides. + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + // User specified plane/row/col inflation strides in the image patch. + Index m_patch_plane_inflate_strides; + Index m_patch_row_inflate_strides; + Index m_patch_col_inflate_strides; + + Index m_planePaddingTop; // Plane padding + Index m_rowPaddingTop; // Row padding + Index m_colPaddingLeft; // Column padding + + // Fast representation of various divisors. + internal::TensorIntDivisor<Index> m_fastNumPatches; + + internal::TensorIntDivisor<Index> m_fastPatchPlaneStride; + internal::TensorIntDivisor<Index> m_fastPatchRowStride; + internal::TensorIntDivisor<Index> m_fastPatchColStride; + + internal::TensorIntDivisor<Index> m_fastInputPlaneStride; + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + + internal::TensorIntDivisor<Index> m_fastRowStride; + internal::TensorIntDivisor<Index> m_fastColStride; + + internal::TensorIntDivisor<Index> m_fastDimZero; // aka output depth + internal::TensorIntDivisor<Index> m_fastOutputPlanes; + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastOutputCols; + internal::TensorIntDivisor<Index> m_fastOutputPlanesRows; + + const TensorEvaluator<ArgType, Device> m_impl; +}; + +template <typename NewDimension, DenseIndex Planes, DenseIndex Rows, + DenseIndex Cols, typename ArgType, typename Device, typename Scalar, + typename Index, typename nocontract_t, typename contract_t, int Side, + int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, + int Alignment> +class TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator<const TensorReshapingOp<NewDimension, + const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef TensorContractionInputMapper< + Scalar, Index, Side, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + ParentMapper; + typedef TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> 
>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper( + const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), + m_depth_offset(vert_offset), + m_col_offset(horiz_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_planeIndex, m_rowIndex, + m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper( + const Self& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper.m_base_mapper), + m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_planeIndex, m_rowIndex, + m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper.loadCoeff(i + m_depth_offset, m_planeIndex, m_rowIndex, + m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, + Index j) const { + return m_base_mapper(i + m_depth_offset, j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_depth_offset, m_planeIndex, + m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, + Index j) const { + return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset, + j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar + loadCoeffStandard(Index i) const { + return m_base_mapper.loadCoeffStandard( + i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const { + return m_base_mapper.loadPacketFast(i + m_depth_offset, m_planeIndex, + m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet + loadPacketStandard(Index i) const { + return m_base_mapper.loadPacketStandard( + i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex); + } + template <typename Packet> + EIGEN_DEVICE_FUNC bool aligned(Index) const { + return false; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_base_mapper.nonStandardPatches(); + } + + // Max(Col|Row|Plane|Depth): compute the upper limit for the column, row, + // plane and depth index respectively that fits into the peeled_k elements + // starting at m_depth_offset. + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { + const Index max_col = + fastPatchColStride().divide(m_depth_offset + peeled_k); + return std::min<Index>(1 + max_col, patchCols()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, + const Index col) const { + const Index max_row = fastPatchRowStride().divide( + m_depth_offset + peeled_k - col * patchColStride()); + return std::min<Index>(1 + max_row, patchRows()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxPlane(const Index peeled_k, const Index col, + const Index row) const { + const Index max_plane = fastPatchPlaneStride().divide( + m_depth_offset + peeled_k - col * patchColStride() - + row * patchRowStride()); + return std::min<Index>(1 + max_plane, patchPlanes()); + } + + // MaxDepth uses only the remaining number of elements in the peeled_k. 
+ EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements, + const Index start_depth) const { + return std::min<Index>(start_depth + num_elements, patchDepth()); + } + + // Every register matters in this code, so sometimes to prevent register + // spilling, instead of the variable that you would expect to see, we use + // another one, that is guaranteed to have the same value. E.g. patch depth is + // always the same as input depth, and it's also the same as input plane + // stride. Bunch of other parameters have similar relations. + + typedef internal::TensorIntDivisor<Index> IndexDivisor; + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { + eigen_assert(m_base_mapper.m_patch_depth == + m_base_mapper.m_planeInputStride && + "Patch depth must be equal to plane input stride."); + return m_base_mapper.m_planeInputStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchPlanes() const { + eigen_assert(m_base_mapper.m_patch_planes == m_base_mapper.m_rowStride && + "Patch planes must be equal to row stride."); + return m_base_mapper.m_rowStride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { + return m_base_mapper.m_patch_rows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { + return m_base_mapper.m_patch_cols; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchPlaneStride() const { + eigen_assert(patchDepth() == m_base_mapper.m_patch_plane_stride && + "Patch depth must be equal to patch plane stride."); + return patchDepth(); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRowStride() const { + return m_base_mapper.m_patch_row_stride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchColStride() const { + return m_base_mapper.m_patch_col_stride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchPlaneStride() const { + eigen_assert(patchDepth() == m_base_mapper.m_patch_plane_stride && + "Patch depth must be equal to patch plane stride."); + return m_base_mapper.m_fastDimZero; // patch_depth + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const { + return m_base_mapper.m_fastPatchRowStride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const { + return m_base_mapper.m_fastPatchColStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, + const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padPlane(const Index plane) const { + const Index p = m_planeIndex + plane; + return p < 0 || p >= m_base_mapper.m_inputPlanes; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padRow(const Index row) const { + const Index r = m_rowIndex + row; + return r < 0 || r >= m_base_mapper.m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padCol(const Index col) const { + const Index c = m_colIndex + col; + return c < 0 || c >= m_base_mapper.m_inputCols; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index baseIndex(const Index plane, const Index row, + const Index col) const { + const Index p = m_planeIndex + plane; + const Index r = m_rowIndex + row; + const Index c = m_colIndex + col; + return p * m_base_mapper.m_planeInputStride + + r * m_base_mapper.m_rowInputStride + + c * m_base_mapper.m_colInputStride + m_otherIndex; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index planeOffset() const { + const 
Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_base_mapper.m_colStride) / + m_base_mapper.m_fastRowStride; + const Index planeOffset = patchOffset - + colOffset * m_base_mapper.m_colStride - + rowOffset * m_base_mapper.m_rowStride; + return planeOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_base_mapper.m_colStride) / + m_base_mapper.m_fastRowStride; + return rowOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return colOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index depthOffset() const { + return m_depth_offset % patchDepth(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper + getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset); + } + + private: + const ParentMapper& m_base_mapper; + Index m_depth_offset; // First row in the input matrix + Index m_col_offset; // First col in the input matrix + + // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base + // indices for the first element in a patch specified by col_offset + // (see computeBaseIndices(...) for details). + Index m_planeIndex; + Index m_rowIndex; + Index m_colIndex; + Index m_otherIndex; +}; + +// Arrange a block of the right input matrix (in our case it's always a "virtual +// matrix" constructed from extracted volume patches) in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... Z0 +// A1 B1 C1 D1 E1 F1 G1 H1 ... Z1 +// A2 B2 C2 D2 E2 F2 G2 H2 ... Z2 +// A3 B3 C3 D3 E3 F3 G3 H3 ... Z3 +// A4 B4 C4 D4 E4 F4 G4 H4 ... Z4 +// A5 B5 C5 D5 E5 F5 G5 H5 ... Z5 +// A6 B6 C6 D6 E6 F6 G6 H6 ... Z6 +// A7 B7 C7 D7 E7 F7 G7 H7 ... Z7 +// A8 ... +// ... +// +// *) A, B, C, ... - patches extracted from the original input. +// *) A0, A1, A2 ... - values from the same patch at different offsets. +// +// The traversal (packed rhs memory) order (B0 besides A0 in memory): +// A0 B0 C0 D0 A1 B1 C1 D1 ... +// E0 F0 G0 H0 E1 F1 G1 H1 ... +// ... +// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4) +// +// This traversal order must be the same as in default gemm_pack_rhs defined in +// GeneralBlockPanelKernel.h. +// +// *) nr - number of registers along the 'n' dimension. +// See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix +// Multiplication" paper. 
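// The traversal order described above is easiest to see in scalar form. The
// helper below is an illustrative sketch (a hypothetical reference routine,
// not used by the specializations that follow): it packs a plain depth x cols
// matrix, exposed through an `at(row, col)` callable, into the same element
// order, but without the vectorized loads, padding checks, and
// patch-coordinate bookkeeping that the real packer needs for the virtual
// patch matrix.
template <typename Scalar, typename Index, typename AccessFn>
void reference_pack_rhs(Scalar* block, const AccessFn& at, Index depth,
                        Index cols) {
  const Index packet_cols4 = (cols / 4) * 4;
  // Interleave blocks of nr = 4 columns: A0 B0 C0 D0 A1 B1 C1 D1 ...
  for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
    for (Index k = 0; k < depth; ++k) {
      *block++ = at(k, j2 + 0);
      *block++ = at(k, j2 + 1);
      *block++ = at(k, j2 + 2);
      *block++ = at(k, j2 + 3);
    }
  }
  // Columns that do not fill a block of 4 are copied one at a time (nr == 1).
  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
    for (Index k = 0; k < depth; ++k) {
      *block++ = at(k, j2);
    }
  }
}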
+template <typename NewDimension, DenseIndex Planes, DenseIndex Rows, + DenseIndex Cols, typename ArgType, typename Device, typename Scalar, + typename Index, typename nocontract_t, typename contract_t, + int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, + int Alignment, int nr> +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + SubMapper; + + typedef SubMapper DataMapper; + typedef typename packet_traits<Scalar>::type Packet; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + const Index packet_cols4 = (cols / 4) * 4; + const Index peeled_k = (depth / packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k = 0; + if ((packet_size % 4) == 0 && !non_standard_patches) { + // FAST PATH: + // Iterate over patch columns, rows and planes if we know that a single + // packet do not span across multiple planes, rows or columns. + if ((rhs.patchDepth() % packet_size) == 0) { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + + for (Index r = start_row; r < max_row; ++r) { + eigen_assert(k <= peeled_k); + + const Index start_plane = ((c == start_col) && (r == start_row)) + ? rhs.planeOffset() + : 0; + const Index max_plane = rhs.maxPlane(peeled_k, c, r); + + const bool pad_row0 = pad_col0 || dm0.padRow(r); + const bool pad_row1 = pad_col1 || dm1.padRow(r); + const bool pad_row2 = pad_col2 || dm2.padRow(r); + const bool pad_row3 = pad_col3 || dm3.padRow(r); + + for (Index p = start_plane; p < max_plane; ++p) { + eigen_assert(k <= peeled_k); + + const bool pad0 = pad_row0 || dm0.padPlane(p); + const bool pad1 = pad_row1 || dm1.padPlane(p); + const bool pad2 = pad_row2 || dm2.padPlane(p); + const bool pad3 = pad_row3 || dm3.padPlane(p); + + const Index idx0 = dm0.baseIndex(p, r, c); + const Index idx1 = dm1.baseIndex(p, r, c); + const Index idx2 = dm2.baseIndex(p, r, c); + const Index idx3 = dm3.baseIndex(p, r, c); + + const Index start_depth = + ((c == start_col) && (r == start_row) && (p == start_plane)) + ? 
rhs.depthOffset() + : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx0); + kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx1); + kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx2); + kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block + 0 * packet_size, kernel.packet[0]); + pstoreu(block + 1 * packet_size, kernel.packet[1]); + pstoreu(block + 2 * packet_size, kernel.packet[2]); + pstoreu(block + 3 * packet_size, kernel.packet[3]); + block += 4 * packet_size; + k += packet_size; + } + } + } + } + + // The loop above should fill peeled_k elements. + eigen_assert(peeled_k == k); + + } else { + // Packet can span multiple planes, rows or columns, so we have to go + // though the slower "standard" path. + for (; k < peeled_k; k += packet_size) { + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = dm0.loadPacketStandard(k); + kernel.packet[1] = dm1.loadPacketStandard(k); + kernel.packet[2] = dm2.loadPacketStandard(k); + kernel.packet[3] = dm3.loadPacketStandard(k); + ptranspose(kernel); + pstoreu(block + 0 * packet_size, kernel.packet[0]); + pstoreu(block + 1 * packet_size, kernel.packet[1]); + pstoreu(block + 2 * packet_size, kernel.packet[2]); + pstoreu(block + 3 * packet_size, kernel.packet[3]); + block += 4 * packet_size; + } + } + } + + // Copy the remaining coefficients of the column block after the peeled_k. + if (!non_standard_patches) { + for (; k < depth; k++) { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } else { + for (; k < depth; k++) { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // Copy the remaining columns one at a time (nr==1). + for (Index j2 = packet_cols4; j2 < cols; ++j2) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) { + *block = dm0(k); + block += 1; + } + } + } +}; + +// Template specialization for packet_size = 2. We must special-case packet +// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>. 
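// A minimal sketch of that special case (assuming two-wide packets such as
// Packet2d and a hypothetical set of four column pointers c0..c3, each with
// at least two readable elements): two 2x2 transposes interleave four columns
// into the same A0 B0 C0 D0 A1 B1 C1 D1 order that the generic path above
// obtains from a single PacketBlock<Packet, 4> transpose.
template <typename Packet, typename Scalar>
void interleave4_with_half_packets(Scalar* out, const Scalar* c0,
                                   const Scalar* c1, const Scalar* c2,
                                   const Scalar* c3) {
  PacketBlock<Packet, 2> kernel0;  // columns 0 and 1
  PacketBlock<Packet, 2> kernel1;  // columns 2 and 3
  kernel0.packet[0] = ploadu<Packet>(c0);
  kernel0.packet[1] = ploadu<Packet>(c1);
  kernel1.packet[0] = ploadu<Packet>(c2);
  kernel1.packet[1] = ploadu<Packet>(c3);
  ptranspose(kernel0);  // packet[i] now holds {c0[i], c1[i]}
  ptranspose(kernel1);  // packet[i] now holds {c2[i], c3[i]}
  pstoreu(out + 0, kernel0.packet[0]);  // c0[0] c1[0]
  pstoreu(out + 2, kernel1.packet[0]);  // c2[0] c3[0]
  pstoreu(out + 4, kernel0.packet[1]);  // c0[1] c1[1]
  pstoreu(out + 6, kernel1.packet[1]);  // c2[1] c3[1]
}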
+template <typename NewDimension, DenseIndex Planes, DenseIndex Rows, + DenseIndex Cols, typename ArgType, typename Device, typename Scalar, + typename Index, typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, + int nr> +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, /*packet_size*/ 2, inner_dim_contiguous, + inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, /*packet_size*/ 2, inner_dim_contiguous, + inner_dim_reordered, Alignment> + SubMapper; + typedef SubMapper DataMapper; + typedef typename packet_traits<Scalar>::type Packet; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + const int packet_size = 2; + + const Index packet_cols4 = (cols / 4) * 4; + const Index peeled_k = (depth / packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k = 0; + if (!non_standard_patches) { + // FAST PATH: + // Iterate over patch columns, rows and planes if we know that a single + // packet do not span across multiple planes, rows or columns. + if ((rhs.patchDepth() % packet_size) == 0) { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + + for (Index r = start_row; r < max_row; ++r) { + eigen_assert(k <= peeled_k); + + const Index start_plane = ((c == start_col) && (r == start_row)) + ? rhs.planeOffset() + : 0; + const Index max_plane = rhs.maxPlane(peeled_k, c, r); + + const bool pad_row0 = dm0.padRow(r); + const bool pad_row1 = dm1.padRow(r); + const bool pad_row2 = dm2.padRow(r); + const bool pad_row3 = dm3.padRow(r); + + for (Index p = start_plane; p < max_plane; ++p) { + eigen_assert(k <= peeled_k); + + const bool pad0 = pad_col0 || pad_row0 || dm0.padPlane(p); + const bool pad1 = pad_col1 || pad_row1 || dm1.padPlane(p); + const bool pad2 = pad_col2 || pad_row2 || dm2.padPlane(p); + const bool pad3 = pad_col3 || pad_row3 || dm3.padPlane(p); + + const Index idx0 = dm0.baseIndex(p, r, c); + const Index idx1 = dm1.baseIndex(p, r, c); + const Index idx2 = dm2.baseIndex(p, r, c); + const Index idx3 = dm3.baseIndex(p, r, c); + + const Index start_depth = + ((c == start_col) && (r == start_row) && (p == start_plane)) + ? 
rhs.depthOffset() + : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock<Packet, 2> kernel0; + PacketBlock<Packet, 2> kernel1; + kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx0); + kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx1); + kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx2); + kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0)) + : rhs.packetNoPadding(d, idx3); + ptranspose(kernel0); + ptranspose(kernel1); + pstoreu(block + 0 * packet_size, kernel0.packet[0]); + pstoreu(block + 1 * packet_size, kernel1.packet[0]); + pstoreu(block + 2 * packet_size, kernel0.packet[1]); + pstoreu(block + 3 * packet_size, kernel1.packet[1]); + block += 4 * packet_size; + k += packet_size; + } + } + } + } + + // The loop above should fill peeled_k elements. + eigen_assert(peeled_k == k); + + } else { + for (; k < peeled_k; k += packet_size) { + PacketBlock<Packet, 2> kernel0; + PacketBlock<Packet, 2> kernel1; + kernel0.packet[0] = dm0.loadPacketStandard(k); + kernel0.packet[1] = dm1.loadPacketStandard(k); + kernel1.packet[0] = dm2.loadPacketStandard(k); + kernel1.packet[1] = dm3.loadPacketStandard(k); + ptranspose(kernel0); + ptranspose(kernel1); + pstoreu(block + 0 * packet_size, kernel0.packet[0]); + pstoreu(block + 1 * packet_size, kernel1.packet[0]); + pstoreu(block + 2 * packet_size, kernel0.packet[1]); + pstoreu(block + 3 * packet_size, kernel1.packet[1]); + block += 4 * packet_size; + } + } + } + + // Copy the remaining coefficients of the column block after the peeled_k. + if (!rhs.nonStandardPatches()) { + for (; k < depth; k++) { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } else { + for (; k < depth; k++) { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // Copy the remaining columns one at a time (nr==1). + for (Index j2 = packet_cols4; j2 < cols; ++j2) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) { + *block = dm0(k); + block += 1; + } + } + } +}; + +// Special case for non-vectorized types such as float16 (packet_size = 1). 
+template <typename NewDimension, DenseIndex Planes, DenseIndex Rows, + DenseIndex Cols, typename ArgType, typename Device, typename Scalar, + typename Index, typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, + int nr> +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, /*packet_size*/ 1, inner_dim_contiguous, + inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator<const TensorReshapingOp< + NewDimension, const TensorVolumePatchOp< + Planes, Rows, Cols, ArgType> >, + Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, + Alignment> + SubMapper; + typedef SubMapper DataMapper; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + const Index packet_cols4 = (cols / 4) * 4; + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + if (!rhs.nonStandardPatches()) { + for (Index k = 0; k < depth; k++) { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } else { + for (Index k = 0; k < depth; k++) { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // Copy the remaining columns one at a time (nr==1). + for (Index j2 = packet_cols4; j2 < cols; ++j2) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) { + *block = dm0(k); + block += 1; + } + } + } +}; + +} // namespace internal + /** CuboidConvolution * \ingroup CXX11_NeuralNetworks_Module * @@ -98,7 +1504,7 @@ CuboidConvolution(const Input& input, const Kernel& kernel, isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; // Spatial size of the kernel. - const TensorIndex kernelDepth = + const TensorIndex kernelPlanes = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; @@ -118,27 +1524,27 @@ CuboidConvolution(const Input& input, const Kernel& kernel, const TensorIndex inputCols = isColMajor ? 
in.dimension(3) : in.dimension(NumDims - 4); - TensorIndex out_depth; + TensorIndex out_planes; TensorIndex out_height; TensorIndex out_width; switch (padding_type) { case PADDING_VALID: - out_depth = Eigen::divup(inputPlanes - kernelDepth + 1, - static_cast<TensorIndex>(stridePlanes)); + out_planes = Eigen::divup(inputPlanes - kernelPlanes + 1, + static_cast<TensorIndex>(stridePlanes)); out_height = Eigen::divup(inputRows - kernelRows + 1, static_cast<TensorIndex>(strideRows)); out_width = Eigen::divup(inputCols - kernelCols + 1, static_cast<TensorIndex>(strideCols)); break; case PADDING_SAME: - out_depth = + out_planes = Eigen::divup(inputPlanes, static_cast<TensorIndex>(stridePlanes)); out_height = Eigen::divup(inputRows, static_cast<TensorIndex>(strideRows)); out_width = Eigen::divup(inputCols, static_cast<TensorIndex>(strideCols)); break; default: - out_depth = 0; + out_planes = 0; out_height = 0; out_width = 0; eigen_assert(false && "unexpected padding"); @@ -147,9 +1553,9 @@ CuboidConvolution(const Input& input, const Kernel& kernel, DSizes<TensorIndex, 2> kernel_dims; if (isColMajor) { kernel_dims[0] = kernelFilters; - kernel_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + kernel_dims[1] = kernelChannels * kernelPlanes * kernelRows * kernelCols; } else { - kernel_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + kernel_dims[0] = kernelChannels * kernelPlanes * kernelRows * kernelCols; kernel_dims[1] = kernelFilters; } @@ -160,15 +1566,15 @@ CuboidConvolution(const Input& input, const Kernel& kernel, DSizes<TensorIndex, 2> pre_contract_dims; if (isColMajor) { pre_contract_dims[0] = - kernelChannels * kernelDepth * kernelRows * kernelCols; - pre_contract_dims[1] = out_depth * out_height * out_width; + kernelChannels * kernelPlanes * kernelRows * kernelCols; + pre_contract_dims[1] = out_planes * out_height * out_width; for (int i = 4; i < NumDims; ++i) { pre_contract_dims[1] *= in.dimension(i); } } else { pre_contract_dims[1] = - kernelChannels * kernelDepth * kernelRows * kernelCols; - pre_contract_dims[0] = out_depth * out_height * out_width; + kernelChannels * kernelPlanes * kernelRows * kernelCols; + pre_contract_dims[0] = out_planes * out_height * out_width; for (int i = 0; i < NumDims - 4; ++i) { pre_contract_dims[0] *= in.dimension(i); } @@ -187,7 +1593,7 @@ CuboidConvolution(const Input& input, const Kernel& kernel, DSizes<TensorIndex, NumDims> post_contract_dims; if (isColMajor) { post_contract_dims[0] = kernelFilters; - post_contract_dims[1] = out_depth; + post_contract_dims[1] = out_planes; post_contract_dims[2] = out_height; post_contract_dims[3] = out_width; for (int i = 4; i < NumDims; ++i) { @@ -195,7 +1601,7 @@ CuboidConvolution(const Input& input, const Kernel& kernel, } } else { post_contract_dims[NumDims - 1] = kernelFilters; - post_contract_dims[NumDims - 2] = out_depth; + post_contract_dims[NumDims - 2] = out_planes; post_contract_dims[NumDims - 3] = out_height; post_contract_dims[NumDims - 4] = out_width; for (int i = 0; i < NumDims - 4; ++i) { @@ -208,13 +1614,13 @@ CuboidConvolution(const Input& input, const Kernel& kernel, kernel.reshape(kernel_dims) .contract(input .extract_volume_patches( - kernelDepth, kernelRows, kernelCols, stridePlanes, + kernelPlanes, kernelRows, kernelCols, stridePlanes, strideRows, strideCols, padding_type) .reshape(pre_contract_dims), contract_dims) .reshape(post_contract_dims), input - .extract_volume_patches(kernelDepth, kernelRows, kernelCols, + .extract_volume_patches(kernelPlanes, 
kernelRows, kernelCols, stridePlanes, strideRows, strideCols, padding_type) .reshape(pre_contract_dims) diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h index a4dff4b91c..e926d73f87 100644 --- a/tensorflow/core/kernels/eigen_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h @@ -22,8 +22,36 @@ namespace Eigen { namespace internal { -// TODO: Consolidate this part of the code with the image patch extraction code -// since they are both very similar. +// WARNING: Most of the code here implicitly assumes that the matrix is in +// ColMajor layout. This is guaranteed by the tensor contraction (see +// TensorContraction.h). +// +// Inside Eigen a tensor contraction is represented by a matrix multiplication. +// We don't want to actually extract image patches and reshape the result into +// a matrix (this involves allocating huge extra memory), so the patch +// extraction and reshape operations are implicit. +// +// TensorContractionInputMapper takes a matrix index and returns the coefficient +// (or the packet) of the "virtual tensor", that would be at that index if we +// were to actually reshape the result of patch extraction. +// +// TensorContractionSubMapper provides a similar view into the "virtual matrix" +// at the given vertical and horizontal offsets. +// +// "Virtual matrix" dimensions: +// *0: kernelChannels * kernelRows * kernelCols; +// 1: out_height * out_width; * OTHERS (e.g batches, etc...) +// +// *) extracted patches are continuous in memory (innermost dimension assuming +// col major layout) +// +// With this dimensions: +// row - offset within a single patch (in code: patchId) +// col - index of the extracted patch (in code: patchIndex) +// patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions) +// +// TODO(ezhulenev): Consolidate this part of the code with the image patch +// extraction code since they are both very similar. template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device, typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, @@ -77,12 +105,17 @@ class TensorContractionInputMapper< m_patch_cols = tensor.impl().dimensions()[2]; m_num_patches = tensor.impl().dimensions()[3]; } else { - const int NumDims = tensor.impl().dimensions().size(); + const size_t NumDims = tensor.impl().dimensions().size(); patch_depth = tensor.impl().dimensions()[NumDims - 1]; patch_rows = tensor.impl().dimensions()[NumDims - 2]; m_patch_cols = tensor.impl().dimensions()[NumDims - 3]; m_num_patches = tensor.impl().dimensions()[NumDims - 4]; } + + // Strides for navigating through the single patch. 
+ m_patch_row_stride = patch_depth; + m_patch_col_stride = patch_rows * m_patch_row_stride; + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); m_patch_col_inflate_strides = tensor.impl().colInflateStride(); @@ -111,6 +144,10 @@ class TensorContractionInputMapper< m_rowPaddingTop = tensor.impl().rowPaddingTop(); m_colPaddingLeft = tensor.impl().colPaddingLeft(); + m_fastPatchRowStride = + internal::TensorIntDivisor<Index>(m_patch_row_stride); + m_fastPatchColStride = + internal::TensorIntDivisor<Index>(m_patch_col_stride); m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides); m_fastInputColStride = @@ -126,6 +163,10 @@ class TensorContractionInputMapper< : m_impl(base_mapper.m_impl) { m_patch_cols = base_mapper.m_patch_cols; m_num_patches = base_mapper.m_num_patches; + + m_patch_row_stride = base_mapper.m_patch_row_stride; + m_patch_col_stride = base_mapper.m_patch_col_stride; + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; @@ -148,6 +189,8 @@ class TensorContractionInputMapper< m_rowPaddingTop = base_mapper.m_rowPaddingTop; m_colPaddingLeft = base_mapper.m_colPaddingLeft; + m_fastPatchRowStride = base_mapper.m_fastPatchRowStride; + m_fastPatchColStride = base_mapper.m_fastPatchColStride; m_fastInputRowStride = base_mapper.m_fastInputRowStride; m_fastInputColStride = base_mapper.m_fastInputColStride; m_fastNumPatches = base_mapper.m_fastNumPatches; @@ -238,6 +281,8 @@ class TensorContractionInputMapper< nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + // Load coefficient from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { @@ -250,6 +295,7 @@ class TensorContractionInputMapper< (m_patch_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index rowOffset = patchOffset - colOffset * m_colStride; const Index inputRow = rowIndex + rowOffset * m_in_row_strides; const Index origInputRow = @@ -268,6 +314,8 @@ class TensorContractionInputMapper< return m_impl.coeff(inputIndex); } + // This is the same as loadCoeff(...), but optimized for all `inflate_strides` + // and `in_strides` equal to 1 (template specialization without templates). EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex, Index colIndex, @@ -276,10 +324,9 @@ class TensorContractionInputMapper< // Find the offset of the element wrt the location of the first element. const Index patchOffset = patchId / m_fastDimZero; - const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex + colOffset; const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputCol = colIndex + colOffset; const Index inputRow = rowIndex + rowOffset; if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || inputRow >= m_inputRows) { @@ -291,6 +338,8 @@ class TensorContractionInputMapper< return m_impl.coeff(inputIndex); } + // Load packet from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, @@ -318,12 +367,14 @@ class TensorContractionInputMapper< if ((patchDepth() % packetSize) == 0) { return loadPacketFast(patchId, rowIndex, colIndex, otherIndex); } else { + // Offsets and input calculation here are identical to + // loadCoeffStandard(...), but repeated twice. + const Index patchOffsets[2] = { patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; - const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]}; if (inputCols[0] >= m_inputCols || inputCols[1] < 0) { @@ -371,8 +422,8 @@ class TensorContractionInputMapper< eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex + colOffset; const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputCol = colIndex + colOffset; const Index inputRow = rowIndex + rowOffset; if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols || inputRow >= m_inputRows) { @@ -401,7 +452,7 @@ class TensorContractionInputMapper< EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices( Index patchIndex, Index& rowIndex, Index& colIndex, Index& otherIndex) const { - const int NumInputDims = array_size< + const size_t NumInputDims = array_size< typename TensorEvaluator<ArgType, Device>::Dimensions>::value; otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; const Index patch2DIndex = (NumInputDims == 3) @@ -414,8 +465,15 @@ class TensorContractionInputMapper< rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; } - Index m_patch_cols; // number of colums in the patch - Index m_num_patches; // number of patches to extract. + Index m_patch_cols; // number of columns in the patch + Index m_num_patches; // number of patches to extract. + + // Strides for navigating through the single patch. + Index m_patch_row_stride; + Index m_patch_col_stride; + internal::TensorIntDivisor<Index> m_fastPatchRowStride; + internal::TensorIntDivisor<Index> m_fastPatchColStride; + Index m_patch_row_inflate_strides; // the strides for row inflation in the // image patch Index m_patch_col_inflate_strides; // the strides for col inflation in the @@ -549,6 +607,40 @@ class TensorContractionSubMapper< return m_base_mapper.nonStandardPatches(); } + // Max(Col|Row|Depth): compute the upper limit for the column, row and depth + // index respectively that fits into the peeled_k elements starting at + // m_depth_offset. + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { + const Index max_col = + fastPatchColStride().divide(m_depth_offset + peeled_k); + return std::min<Index>(1 + max_col, patchCols()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, + const Index col) const { + const Index max_row = fastPatchRowStride().divide( + m_depth_offset + peeled_k - col * patchColStride()); + return std::min<Index>(1 + max_row, patchRows()); + } + + // MaxDepth uses only the remaining number of elements in the peeled_k. 
+ EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements, + const Index start_depth) const { + return std::min<Index>(start_depth + num_elements, patchDepth()); + } + + // Every register matters in this code, so sometimes to prevent register + // spilling, instead of the variable that you would expect to see, we use + // another one, that is guaranteed to have the same value. E.g. patch depth is + // always the same as input depth, and it's also the same as input row stride. + // Bunch of other parameters have similar relations. + + typedef internal::TensorIntDivisor<Index> IndexDivisor; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_base_mapper.m_rowInputStride; @@ -563,6 +655,28 @@ class TensorContractionSubMapper< } EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRowStride() const { + eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride && + "Patch depth must be equal to patch row stride."); + return patchDepth(); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchColStride() const { + return m_base_mapper.m_patch_col_stride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const { + eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride && + "Patch depth must be equal to patch row stride."); + return m_base_mapper.m_fastDimZero; // patch_depth + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const { + return m_base_mapper.m_fastPatchColStride; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const { const Index inputIndex = depth + baseIndex; @@ -603,8 +717,7 @@ class TensorContractionSubMapper< EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index depthOffset() const { - const Index patchOffset = m_depth_offset % m_base_mapper.patchDepth(); - return patchOffset; + return m_depth_offset % patchDepth(); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper @@ -617,12 +730,44 @@ class TensorContractionSubMapper< Index m_depth_offset; // First row in the input matrix Index m_col_offset; // First col in the input matrix - Index m_rowIndex; // precomputed row index corresponding to the col offset - Index m_colIndex; // precomputed col index corresponding to the col offset - Index - m_otherIndex; // precomputed other index corresponding to the col offset + // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base + // indices for the first element in a patch specified by col_offset + // (see computeBaseIndices(...) for details). + Index m_rowIndex; + Index m_colIndex; + Index m_otherIndex; }; +// Arrange a block of the right input matrix (in our case it's always a "virtual +// matrix" constructed from extracted image patches) in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... Z0 +// A1 B1 C1 D1 E1 F1 G1 H1 ... Z1 +// A2 B2 C2 D2 E2 F2 G2 H2 ... Z2 +// A3 B3 C3 D3 E3 F3 G3 H3 ... Z3 +// A4 B4 C4 D4 E4 F4 G4 H4 ... Z4 +// A5 B5 C5 D5 E5 F5 G5 H5 ... Z5 +// A6 B6 C6 D6 E6 F6 G6 H6 ... Z6 +// A7 B7 C7 D7 E7 F7 G7 H7 ... Z7 +// A8 ... +// ... +// +// *) A, B, C, ... - patches extracted from the original input. +// *) A0, A1, A2 ... - values from the same patch at different offsets. +// +// The traversal (packed rhs memory) order (B0 besides A0 in memory): +// A0 B0 C0 D0 A1 B1 C1 D1 ... +// E0 F0 G0 H0 E1 F1 G1 H1 ... +// ... +// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... 
<- doesn't belong to any block (nr = 4) +// +// This traversal order must be the same as in default gemm_pack_rhs defined in +// GeneralBlockPanelKernel.h. +// +// *) nr - number of registers along the 'n' dimension. +// See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix +// Multiplication" paper. template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device, typename Scalar, typename Index, typename nocontract_t, typename contract_t, int packet_size, @@ -649,9 +794,9 @@ struct gemm_pack_rhs< inner_dim_reordered, Alignment> SubMapper; typedef SubMapper DataMapper; + typedef typename packet_traits<Scalar>::type Packet; - EIGEN_DEVICE_FUNC - static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; } + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, @@ -660,9 +805,6 @@ struct gemm_pack_rhs< eigen_assert(stride == 0); eigen_assert(offset == 0); - EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); - typedef typename packet_traits<Scalar>::type Packet; - const Index packet_cols4 = (cols / 4) * 4; const Index peeled_k = (depth / packet_size) * packet_size; const bool non_standard_patches = rhs.nonStandardPatches(); @@ -675,30 +817,27 @@ struct gemm_pack_rhs< Index k = 0; if ((packet_size % 4) == 0 && !non_standard_patches) { - const Index patch_depth = rhs.patchDepth(); - if ((patch_depth % packet_size) == 0) { - const Index patch_cols = rhs.patchCols(); - const Index patch_rows = rhs.patchRows(); - - const Index startCol = rhs.colOffset(); - const Index max_cols = std::min<Index>( - ceil_div(peeled_k, patch_rows * patch_depth) + startCol, - patch_cols); - - for (Index c = startCol; c < max_cols; ++c) { - eigen_assert(k < peeled_k); - const Index startRow = (c == startCol) ? rhs.rowOffset() : 0; - const Index max_rows = std::min<Index>( - ceil_div(peeled_k - c * patch_rows * patch_depth, patch_depth) + - startRow, - patch_rows); + // FAST PATH: + // Iterate over patch columns and rows, if we know that a single + // packet do not span across multiple rows or columns. + if ((rhs.patchDepth() % packet_size) == 0) { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); const bool pad_col0 = dm0.padCol(c); const bool pad_col1 = dm1.padCol(c); const bool pad_col2 = dm2.padCol(c); const bool pad_col3 = dm3.padCol(c); - for (Index r = startRow; r < max_rows; ++r) { - eigen_assert(k < peeled_k); + + for (Index r = start_row; r < max_row; ++r) { + eigen_assert(k <= peeled_k); + const bool pad0 = pad_col0 || dm0.padRow(r); const bool pad1 = pad_col1 || dm1.padRow(r); const bool pad2 = pad_col2 || dm2.padRow(r); @@ -709,14 +848,13 @@ struct gemm_pack_rhs< const Index idx2 = dm2.baseIndex(r, c); const Index idx3 = dm3.baseIndex(r, c); - const Index startDepth = - ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0; - const Index max_depth = - std::min<Index>(peeled_k - c * patch_rows * patch_depth - - r * patch_depth + startDepth, - patch_depth); - eigen_assert((max_depth - startDepth) % packet_size == 0); - for (Index d = startDepth; d < max_depth; d += packet_size) { + const Index start_depth = ((c == start_col) && (r == start_row)) + ? 
rhs.depthOffset() + : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) { eigen_assert(k < peeled_k); PacketBlock<Packet, 4> kernel; kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0)) @@ -738,19 +876,9 @@ struct gemm_pack_rhs< } } - for (; k < peeled_k; k += packet_size) { - PacketBlock<Packet, 4> kernel; - kernel.packet[0] = dm0.loadPacketFast(k); - kernel.packet[1] = dm1.loadPacketFast(k); - kernel.packet[2] = dm2.loadPacketFast(k); - kernel.packet[3] = dm3.loadPacketFast(k); - ptranspose(kernel); - pstoreu(block + 0 * packet_size, kernel.packet[0]); - pstoreu(block + 1 * packet_size, kernel.packet[1]); - pstoreu(block + 2 * packet_size, kernel.packet[2]); - pstoreu(block + 3 * packet_size, kernel.packet[3]); - block += 4 * packet_size; - } + // The loop above should fill peeled_k elements. + eigen_assert(peeled_k == k); + } else { for (; k < peeled_k; k += packet_size) { PacketBlock<Packet, 4> kernel; @@ -767,6 +895,8 @@ struct gemm_pack_rhs< } } } + + // Copy the remaining coefficients of the column block after the peeled_k. if (!rhs.nonStandardPatches()) { for (; k < depth; k++) { block[0] = dm0.loadCoeffStandard(k); @@ -824,9 +954,9 @@ struct gemm_pack_rhs< Alignment> SubMapper; typedef SubMapper DataMapper; + typedef typename packet_traits<Scalar>::type Packet; - EIGEN_DEVICE_FUNC - static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; } + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, @@ -835,9 +965,6 @@ struct gemm_pack_rhs< eigen_assert(stride == 0); eigen_assert(offset == 0); - EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); - typedef typename packet_traits<Scalar>::type Packet; - const int packet_size = 2; const Index packet_cols4 = (cols / 4) * 4; const Index peeled_k = (depth / packet_size) * packet_size; @@ -851,30 +978,27 @@ struct gemm_pack_rhs< Index k = 0; if (!non_standard_patches) { - const Index patch_depth = rhs.patchDepth(); - if ((patch_depth % packet_size) == 0) { - const Index patch_cols = rhs.patchCols(); - const Index patch_rows = rhs.patchRows(); - - const Index startCol = rhs.colOffset(); - const Index max_cols = std::min<Index>( - ceil_div(peeled_k, patch_rows * patch_depth) + startCol, - patch_cols); - - for (Index c = startCol; c < max_cols; ++c) { - eigen_assert(k < peeled_k); - const Index startRow = (c == startCol) ? rhs.rowOffset() : 0; - const Index max_rows = std::min<Index>( - ceil_div(peeled_k - c * patch_rows * patch_depth, patch_depth) + - startRow, - patch_rows); + // FAST PATH: + // Iterate over patch columns and rows if we know that a single + // packet do not span across multiple rows or columns. + if ((rhs.patchDepth() % packet_size) == 0) { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? 
rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); const bool pad_col0 = dm0.padCol(c); const bool pad_col1 = dm1.padCol(c); const bool pad_col2 = dm2.padCol(c); const bool pad_col3 = dm3.padCol(c); - for (Index r = startRow; r < max_rows; ++r) { - eigen_assert(k < peeled_k); + + for (Index r = start_row; r < max_row; ++r) { + eigen_assert(k <= peeled_k); + const bool pad0 = pad_col0 || dm0.padRow(r); const bool pad1 = pad_col1 || dm1.padRow(r); const bool pad2 = pad_col2 || dm2.padRow(r); @@ -885,14 +1009,13 @@ struct gemm_pack_rhs< const Index idx2 = dm2.baseIndex(r, c); const Index idx3 = dm3.baseIndex(r, c); - const Index startDepth = - ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0; - const Index max_depth = - std::min<Index>(peeled_k - c * patch_rows * patch_depth - - r * patch_depth + startDepth, - patch_depth); - eigen_assert((max_depth - startDepth) % packet_size == 0); - for (Index d = startDepth; d < max_depth; d += packet_size) { + const Index start_depth = ((c == start_col) && (r == start_row)) + ? rhs.depthOffset() + : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) { eigen_assert(k < peeled_k); PacketBlock<Packet, 2> kernel0; PacketBlock<Packet, 2> kernel1; @@ -916,22 +1039,12 @@ struct gemm_pack_rhs< } } - for (; k < peeled_k; k += packet_size) { - PacketBlock<Packet, 2> kernel0; - PacketBlock<Packet, 2> kernel1; - kernel0.packet[0] = dm0.loadPacketFast(k); - kernel0.packet[1] = dm1.loadPacketFast(k); - kernel1.packet[0] = dm2.loadPacketFast(k); - kernel1.packet[1] = dm3.loadPacketFast(k); - ptranspose(kernel0); - ptranspose(kernel1); - pstoreu(block + 0 * packet_size, kernel0.packet[0]); - pstoreu(block + 1 * packet_size, kernel1.packet[0]); - pstoreu(block + 2 * packet_size, kernel0.packet[1]); - pstoreu(block + 3 * packet_size, kernel1.packet[1]); - block += 4 * packet_size; - } + // The loop above should fill peeled_k elements. + eigen_assert(peeled_k == k); + } else { + // Packet can span multiple rows or columns, so we have to go + // though the slower "standard" path. for (; k < peeled_k; k += packet_size) { PacketBlock<Packet, 2> kernel0; PacketBlock<Packet, 2> kernel1; @@ -949,7 +1062,9 @@ struct gemm_pack_rhs< } } } - if (!rhs.nonStandardPatches()) { + + // Copy the remaining coefficients of the column block after the peeled_k. + if (!non_standard_patches) { for (; k < depth; k++) { block[0] = dm0.loadCoeffStandard(k); block[1] = dm1.loadCoeffStandard(k); @@ -968,7 +1083,7 @@ struct gemm_pack_rhs< } } - // copy the remaining columns one at a time (nr==1) + // Copy the remaining columns one at a time (nr==1). 
for (Index j2 = packet_cols4; j2 < cols; ++j2) { const SubMapper dm0 = rhs.getLinearMapper(0, j2); for (Index k = 0; k < depth; k++) { @@ -1006,8 +1121,7 @@ struct gemm_pack_rhs< SubMapper; typedef SubMapper DataMapper; - EIGEN_DEVICE_FUNC - static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; } + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, @@ -1016,8 +1130,6 @@ struct gemm_pack_rhs< eigen_assert(stride == 0); eigen_assert(offset == 0); - EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); - const Index packet_cols4 = (cols / 4) * 4; for (Index j2 = 0; j2 < packet_cols4; j2 += 4) { @@ -1045,7 +1157,7 @@ struct gemm_pack_rhs< } } - // copy the remaining columns one at a time (nr==1) + // Copy the remaining columns one at a time (nr==1). for (Index j2 = packet_cols4; j2 < cols; ++j2) { const SubMapper dm0 = rhs.getLinearMapper(0, j2); for (Index k = 0; k < depth; k++) { diff --git a/tensorflow/core/kernels/eigen_volume_patch.h b/tensorflow/core/kernels/eigen_volume_patch.h index a3d795813d..80ab745bfe 100644 --- a/tensorflow/core/kernels/eigen_volume_patch.h +++ b/tensorflow/core/kernels/eigen_volume_patch.h @@ -43,6 +43,7 @@ struct CustomTensorEvaluator { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = NumDims == 6, RawAccess = false diff --git a/tensorflow/core/kernels/extract_volume_patches_op.cc b/tensorflow/core/kernels/extract_volume_patches_op.cc new file mode 100644 index 0000000000..52cd078a35 --- /dev/null +++ b/tensorflow/core/kernels/extract_volume_patches_op.cc @@ -0,0 +1,197 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/* +See extract_image_patches_op* files and docs for extract_image_patches in +../ops/image_ops.cc. + +Rates are not supported as of now, but the comments hint how to edit the code +when rates are to be added. 
+*/ + +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/extract_volume_patches_op.h" +#include <vector> +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +static inline void ParseAttributeVec5(OpKernelConstruction* context, + const string& attr_name, + std::vector<int32>* attr) { + OP_REQUIRES_OK(context, context->GetAttr(attr_name, attr)); + OP_REQUIRES( + context, (*attr)[0] == 1 && (*attr)[4] == 1, + errors::Unimplemented("Only support ", attr_name, " across space.")); + OP_REQUIRES(context, (*attr)[1] >= 1 && (*attr)[2] >= 1 && (*attr)[3] >= 1, + errors::OutOfRange(attr_name, " is out of range.")); +} + +template <typename Device, typename T> +class ExtractVolumePatchesOp : public UnaryOp<T> { + public: + explicit ExtractVolumePatchesOp(OpKernelConstruction* context) + : UnaryOp<T>(context) { + ParseAttributeVec5(context, "ksizes", &ksizes_); + ParseAttributeVec5(context, "strides", &strides_); + // ParseAttributeVec5(context, "rates", &rates_); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + // Input tensor is of the following dimensions: + // [ batch, in_planes, in_rows, in_cols, channels ] + const Tensor& input = context->input(0); + OP_REQUIRES(context, input.dims() == 5, + errors::InvalidArgument("input must be 5-dimensional", + input.shape().DebugString())); + + const int batch = input.dim_size(0); + const int in_planes = input.dim_size(1); + const int in_rows = input.dim_size(2); + const int in_cols = input.dim_size(3); + const int depth = input.dim_size(4); + + const int ksize_planes = ksizes_[1]; + const int ksize_rows = ksizes_[2]; + const int ksize_cols = ksizes_[3]; + + const int stride_planes = strides_[1]; + const int stride_rows = strides_[2]; + const int stride_cols = strides_[3]; + + /* + // TODO(hsgkim): enable rates + // Rates are disabled as of now due to Eigen's definitions of + // `extract_volume_patch` functions; none of them accept rates + // as its argument and rates are fixed to (1, 1, 1, 1, 1). A + // workaround has to be found for this. + // In order to enable rates, uncomment the following lines and use + // ksize_*_eff instead of ksize_* for the second argument of + // GetWindowedOutputSize calls. 
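+  // As a quick sketch of the arithmetic involved (illustrative numbers
+  // only): a rate of 2 on a 3-tap kernel gives an effective extent of
+  // 3 + (3 - 1) * (2 - 1) = 5, and with in_planes = 8, stride 2,
+  // GetWindowedOutputSize would then report out = 4 for SAME padding
+  // (ceil(8 / 2)) versus out = 2 for VALID (ceil((8 - 5 + 1) / 2)).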
+ + const int rate_planes = rates_[1]; + const int rate_rows = rates_[2]; + const int rate_cols = rates_[3]; + + const int ksize_planes_eff = ksize_planes + + (ksize_planes - 1) * (rate_planes - 1); + const int ksize_rows_eff = ksize_rows + (ksize_rows - 1) * (rate_rows - 1); + const int ksize_cols_eff = ksize_cols + (ksize_cols - 1) * (rate_cols - 1); + */ + + int64 out_planes = 0, out_rows = 0, out_cols = 0; + int64 pad_planes = 0, pad_rows = 0, pad_cols = 0; + OP_REQUIRES_OK(context, + GetWindowedOutputSize(in_planes, ksize_planes, stride_planes, + padding_, &out_planes, &pad_planes)); + OP_REQUIRES_OK(context, + GetWindowedOutputSize(in_rows, ksize_rows, stride_rows, + padding_, &out_rows, &pad_rows)); + OP_REQUIRES_OK(context, + GetWindowedOutputSize(in_cols, ksize_cols, stride_cols, + padding_, &out_cols, &pad_cols)); + + const std::vector<int64> out_sizes = { + batch, out_planes, out_rows, out_cols, + ksize_planes * ksize_rows * ksize_cols * depth}; + TensorShape out_shape(out_sizes); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + // If there is nothing to compute, return. + if (out_shape.num_elements() == 0) { + return; + } + + functor::ExtractVolumePatchesForward<Device, T>()( + context->eigen_device<Device>(), input.tensor<T, 5>(), ksize_planes, + ksize_rows, ksize_cols, stride_planes, stride_rows, stride_cols, + /* rate_planes, rate_rows, rate_cols, */ + BrainPadding2EigenPadding(padding_), output->tensor<T, 5>()); + } + + private: + std::vector<int32> ksizes_; + std::vector<int32> strides_; + // std::vector<int32> rates_; + + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(ExtractVolumePatchesOp); +}; + +// Registration of the CPU implementations. +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ExtractVolumePatches").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + ExtractVolumePatchesOp<CPUDevice, T>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER); + +#undef REGISTER + +#if GOOGLE_CUDA + +// Forward declarations of the functor specializations for GPU. +namespace functor { + +// clang-format off +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ExtractVolumePatchesForward<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input, \ + int patch_planes, int patch_rows, int patch_cols, \ + int stride_planes, int stride_rows, int stride_cols, \ + /* int rate_planes, int rate_rows, int rate_cols, */ \ + const Eigen::PaddingType& padding, \ + typename TTypes<T, 5>::Tensor output); \ + extern template struct ExtractVolumePatchesForward<GPUDevice, T>; +// clang-format on + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); + +#undef DECLARE_GPU_SPEC + +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ExtractVolumePatches").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ + ExtractVolumePatchesOp<GPUDevice, T>); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER); + +#undef REGISTER + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/extract_volume_patches_op.h b/tensorflow/core/kernels/extract_volume_patches_op.h new file mode 100644 index 0000000000..7e0502b770 --- /dev/null +++ b/tensorflow/core/kernels/extract_volume_patches_op.h @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_EXTRACT_VOLUME_PATCHES_OP_H_ +#define TENSORFLOW_KERNELS_EXTRACT_VOLUME_PATCHES_OP_H_ + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_volume_patch.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T> +struct ExtractVolumePatchesForward { + void operator()(const Device& d, typename TTypes<T, 5>::ConstTensor input, + int patch_planes, int patch_rows, int patch_cols, + int stride_planes, int stride_rows, int stride_cols, + /* int rate_planes, int rate_rows, int rate_cols, */ + const Eigen::PaddingType& padding, + typename TTypes<T, 5>::Tensor output) { + const int64 N = std::max(input.size(), output.size()); + if (N <= std::numeric_limits<Index32>::max()) { + auto output_32bit = To32Bit(output); + output_32bit.device(d) = + To32Bit(input) + .extract_volume_patches(patch_cols, patch_rows, patch_planes, + stride_cols, stride_rows, stride_planes, + padding) + .reshape(output_32bit.dimensions()); + } else { + output.device(d) = + input + .extract_volume_patches(patch_cols, patch_rows, patch_planes, + stride_cols, stride_rows, stride_planes, + padding) + .reshape(output.dimensions()); + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_EXTRACT_VOLUME_PATCHES_OP_H_ diff --git a/tensorflow/core/kernels/extract_volume_patches_op_gpu.cu.cc b/tensorflow/core/kernels/extract_volume_patches_op_gpu.cu.cc new file mode 100644 index 0000000000..c636493602 --- /dev/null +++ b/tensorflow/core/kernels/extract_volume_patches_op_gpu.cu.cc @@ -0,0 +1,38 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/extract_volume_patches_op.h" +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +#define REGISTER(T) template struct ExtractVolumePatchesForward<GPUDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(REGISTER); + +#undef REGISTER + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD index 8bfa40304e..f2e0b2558f 100644 --- a/tensorflow/core/kernels/fuzzing/BUILD +++ b/tensorflow/core/kernels/fuzzing/BUILD @@ -43,4 +43,6 @@ tf_ops_fuzz_target_lib("example_proto_fast_parsing") tf_ops_fuzz_target_lib("parse_tensor_op") +tf_ops_fuzz_target_lib("decode_compressed") + tf_ops_fuzz_target_lib("decode_json_example") diff --git a/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc new file mode 100644 index 0000000000..0a56f4b63f --- /dev/null +++ b/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc @@ -0,0 +1,45 @@ +/* Copyright 2018 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" + +namespace tensorflow { +namespace fuzzing { + +class FuzzDecodeCompressed : public FuzzStringInputOp { + void BuildGraph(const Scope& scope) override { + auto input = + tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING); + auto d1 = tensorflow::ops::DecodeCompressed( + scope.WithOpName("d1"), input, + tensorflow::ops::DecodeCompressed::CompressionType("")); + auto d2 = tensorflow::ops::DecodeCompressed( + scope.WithOpName("d2"), input, + tensorflow::ops::DecodeCompressed::CompressionType("ZLIB")); + auto d3 = tensorflow::ops::DecodeCompressed( + scope.WithOpName("d3"), input, + tensorflow::ops::DecodeCompressed::CompressionType("GZIP")); + Scope grouper = + scope.WithControlDependencies(std::vector<tensorflow::Operation>{ + d1.output.op(), d2.output.op(), d3.output.op()}); + (void)tensorflow::ops::NoOp(grouper.WithOpName("output")); + } +}; + +STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeCompressed); + +} // namespace fuzzing +} // namespace tensorflow diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h index cd2873bdca..7710cf93d6 100644 --- a/tensorflow/core/kernels/gather_functor.h +++ b/tensorflow/core/kernels/gather_functor.h @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/variant.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/platform/prefetch.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc index a88e9b0ddc..374a05850e 100644 --- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc +++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc @@ -18,7 +18,7 @@ limitations under the License. #define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "external/cub_archive/cub/device/device_histogram.cuh" +#include "third_party/cub/device/device_histogram.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc index bca1cff41c..2088c13586 100644 --- a/tensorflow/core/kernels/list_kernels.cc +++ b/tensorflow/core/kernels/list_kernels.cc @@ -77,9 +77,9 @@ static Status TensorListDeviceCopy( return Status::OK(); } -#define REGISTER_LIST_COPY(DIRECTION) \ - INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \ - TensorList, DIRECTION, TensorList::kTypeName, TensorListDeviceCopy) +#define REGISTER_LIST_COPY(DIRECTION) \ + INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION(TensorList, DIRECTION, \ + TensorListDeviceCopy) REGISTER_LIST_COPY(VariantDeviceCopyDirection::HOST_TO_DEVICE); REGISTER_LIST_COPY(VariantDeviceCopyDirection::DEVICE_TO_HOST); @@ -92,8 +92,7 @@ Status TensorListShape(const TensorList& t, TensorShape* s) { return Status::OK(); } -REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(TensorList, TensorList::kTypeName, - TensorListShape); +REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(TensorList, TensorListShape); bool TensorList::Decode(const VariantTensorData& data) { tensors = data.tensors(); @@ -625,12 +624,11 @@ REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(bfloat16); #undef REGISTER_TENSOR_LIST_FROM_TENSOR_CPU REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU, - TensorList, TensorList::kTypeName, + TensorList, TensorListBinaryAdd<CPUDevice>); REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP, DEVICE_CPU, TensorList, - TensorList::kTypeName, TensorListZerosLike<CPUDevice>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc index c591226b76..a00bf700ca 100644 --- a/tensorflow/core/kernels/list_kernels.cu.cc +++ b/tensorflow/core/kernels/list_kernels.cu.cc @@ -94,11 +94,10 @@ REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bool); #undef REGISTER_TENSOR_LIST_FROM_TENSOR_GPU REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU, - TensorList, TensorList::kTypeName, + TensorList, TensorListBinaryAdd<GPUDevice>); REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP, DEVICE_GPU, TensorList, - TensorList::kTypeName, TensorListZerosLike<GPUDevice>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc index 6b6a14e9a7..1ded012f3c 100644 --- a/tensorflow/core/kernels/logging_ops.cc +++ b/tensorflow/core/kernels/logging_ops.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include <iostream> +#include "absl/strings/str_split.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -90,6 +91,59 @@ class PrintOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("Print").Device(DEVICE_CPU), PrintOp); +class PrintV2Op : public OpKernel { + public: + explicit PrintV2Op(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_stream", &output_stream_)); + + auto output_stream_index = + std::find(std::begin(valid_output_streams_), + std::end(valid_output_streams_), output_stream_); + + if (output_stream_index == std::end(valid_output_streams_)) { + string error_msg = strings::StrCat( + "Unknown output stream: ", output_stream_, ", Valid streams are:"); + for (auto valid_stream : valid_output_streams_) { + strings::StrAppend(&error_msg, " ", valid_stream); + } + OP_REQUIRES(ctx, false, errors::InvalidArgument(error_msg)); + } + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* input_; + OP_REQUIRES_OK(ctx, ctx->input("input", &input_)); + const string& msg = input_->scalar<string>()(); + + if (output_stream_ == "stdout") { + std::cout << msg << std::endl; + } else if (output_stream_ == "stderr") { + std::cerr << msg << std::endl; + } else if (output_stream_ == "log(info)") { + LOG(INFO) << msg << std::endl; + } else if (output_stream_ == "log(warning)") { + LOG(WARNING) << msg << std::endl; + } else if (output_stream_ == "log(error)") { + LOG(ERROR) << msg << std::endl; + } else { + string error_msg = strings::StrCat( + "Unknown output stream: ", output_stream_, ", Valid streams are:"); + for (auto valid_stream : valid_output_streams_) { + strings::StrAppend(&error_msg, " ", valid_stream); + } + OP_REQUIRES(ctx, false, errors::InvalidArgument(error_msg)); + } + } + + const char* valid_output_streams_[6] = {"stdout", "stderr", "log(info)", + "log(warning)", "log(error)"}; + + private: + string output_stream_; +}; + +REGISTER_KERNEL_BUILDER(Name("PrintV2").Device(DEVICE_CPU), PrintV2Op); + class TimestampOp : public OpKernel { public: explicit TimestampOp(OpKernelConstruction* context) : OpKernel(context) {} diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc index 5e6958f364..a259d995fa 100644 --- a/tensorflow/core/kernels/logging_ops_test.cc +++ b/tensorflow/core/kernels/logging_ops_test.cc @@ -23,11 +23,33 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" namespace tensorflow { namespace { +class PrintingV2GraphTest : public OpsTestBase { + protected: + Status Init(const string& output_stream = "log(warning)") { + TF_CHECK_OK(NodeDefBuilder("op", "PrintV2") + .Input(FakeInput(DT_STRING)) + .Attr("output_stream", output_stream) + .Finalize(node_def())); + return InitOp(); + } +}; + +TEST_F(PrintingV2GraphTest, StringSuccess) { + TF_ASSERT_OK(Init()); + AddInputFromArray<string>(TensorShape({}), {"bar"}); + TF_ASSERT_OK(RunOpKernel()); +} + +TEST_F(PrintingV2GraphTest, InvalidOutputStream) { + ASSERT_NE(::tensorflow::Status::OK(), (Init("invalid_output_stream"))); +} + class PrintingGraphTest : public OpsTestBase { protected: Status Init(DataType input_type1, DataType input_type2, string msg = "", diff --git a/tensorflow/core/kernels/mirror_pad_op.h b/tensorflow/core/kernels/mirror_pad_op.h index cc4b6941b9..62aa7d5c29 100644 --- a/tensorflow/core/kernels/mirror_pad_op.h +++ b/tensorflow/core/kernels/mirror_pad_op.h @@ -103,6 +103,7 @@ struct TensorEvaluator<const TensorMirrorPadOp<PaddingDimensions, ArgType>, IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = true, RawAccess = false diff --git a/tensorflow/core/kernels/mkl_conv_ops_test.cc b/tensorflow/core/kernels/mkl_conv_ops_test.cc new file mode 100644 index 0000000000..a055351337 --- /dev/null +++ b/tensorflow/core/kernels/mkl_conv_ops_test.cc @@ -0,0 +1,407 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +#if defined(INTEL_MKL_DNN_ONLY) +#include "third_party/intel_mkl_dnn/include/mkldnn.h" +#include "tensorflow/core/util/mkl_util.h" +#endif + +// TODO(ezhulenev): Add numerical tests that will compare results of default +// (aka Eigen) convolutions with MKL convolutions. + +// -------------------------------------------------------------------------- // +// Performance Benchmarks. 
// +// -------------------------------------------------------------------------- // + +// Compare performance of default Tensorflow convolution kernels (Eigen) with +// MKL kernels on CPU. + +// Before running these benchmarks configure OpenMP environment variables: +// export KMP_BLOCKTIME=0 +// export OMP_NUM_THREADS=${num_threads} + +namespace tensorflow { + +struct Conv2DDimensions { + Conv2DDimensions(int n, int h, int w, int c, int fc, int fh, int fw) + : input_batches(n), + input_height(h), + input_width(w), + input_depth(c), + filter_count(fc), + filter_height(fh), + filter_width(fw) {} + + int input_batches; + int input_height; + int input_width; + int input_depth; + int filter_count; + int filter_height; + int filter_width; +}; + +static Tensor GetRandomTensor(const TensorShape& shape) { + Tensor tensor(DT_FLOAT, TensorShape(shape)); + tensor.flat<float>() = tensor.flat<float>().setRandom(); + return tensor; +} + +// Get a random Tensor for the Conv2D input. +static Tensor GetRandomInputTensor(const Conv2DDimensions& dims) { + return GetRandomTensor({dims.input_batches, dims.input_height, + dims.input_width, dims.input_depth}); +} + +// Get a random Tensor for the Conv2D filter. +static Tensor GetRandomFilterTensor(const Conv2DDimensions& dims) { + return GetRandomTensor({dims.filter_height, dims.filter_width, + dims.input_depth, dims.filter_count}); +} + +// Get a random Tensor for the Conv2D output (assuming SAME padding). +static Tensor GetRandomOutputTensor(const Conv2DDimensions& dims) { + return GetRandomTensor({dims.input_batches, dims.input_height, + dims.input_width, dims.filter_count}); +} + +// Get a Tensor encoding Conv2D input shape. +static Tensor GetInputSizesTensor(const Conv2DDimensions& dims) { + return test::AsTensor<int32>({dims.input_batches, dims.input_height, + dims.input_width, dims.input_depth}); +} + +// Get a Tensor encoding Conv2D filter shape. 
+static Tensor GetFilterSizesTensor(const Conv2DDimensions& dims) { + return test::AsTensor<int32>({dims.filter_height, dims.filter_width, + dims.input_depth, dims.filter_count}); +} + +#if defined(INTEL_MKL_DNN_ONLY) +static Tensor NonMklTensor() { + MklDnnShape non_mkl_shape; + non_mkl_shape.SetMklTensor(false); + + auto size = static_cast<int64>(non_mkl_shape.GetSerializeBufferSize()); + Tensor tensor(DT_UINT8, {size}); + + non_mkl_shape.SerializeMklDnnShape(tensor.flat<uint8>().data(), + size * sizeof(uint8)); + return tensor; +} +#endif + +static Graph* DefaultConv2D(const Conv2DDimensions& dims) { + auto* graph = new Graph(OpRegistry::Global()); + + Tensor input_t = GetRandomInputTensor(dims); + Tensor filter_t = GetRandomFilterTensor(dims); + + Node* input = test::graph::Constant(graph, input_t, "input"); + Node* filter = test::graph::Constant(graph, filter_t, "filter"); + + Node* conv2d; + TF_CHECK_OK(NodeBuilder(graph->NewName("conv_2d"), "Conv2D") + .Input(input) + .Input(filter) + .Attr("T", DT_FLOAT) + .Attr("strides", {1, 1, 1, 1}) + .Attr("padding", "SAME") + .Finalize(graph, &conv2d)); + + return graph; +} + +#if defined(INTEL_MKL_DNN_ONLY) +static Graph* MklConv2D(const Conv2DDimensions& dims) { + auto* graph = new Graph(OpRegistry::Global()); + + Tensor input_t = GetRandomInputTensor(dims); + Tensor filter_t = GetRandomFilterTensor(dims); + + Node* input = test::graph::Constant(graph, input_t, "input"); + Node* filter = test::graph::Constant(graph, filter_t, "filter"); + + Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl"); + + Node* conv2d; + TF_CHECK_OK(NodeBuilder(graph->NewName("mkl_conv_2d"), "_MklConv2D") + .Input(input) + .Input(filter) + .Input(not_mkl_shape) + .Input(not_mkl_shape) + .Attr("T", DT_FLOAT) + .Attr("strides", {1, 1, 1, 1}) + .Attr("padding", "SAME") + .Attr("_kernel", "MklOp") + .Finalize(graph, &conv2d)); + + return graph; +} +#endif + +static Graph* DefaultConv2DBwdInput(const Conv2DDimensions& dims) { + auto* graph = new Graph(OpRegistry::Global()); + + Tensor input_sizes_t = GetInputSizesTensor(dims); + Tensor filter_t = GetRandomFilterTensor(dims); + Tensor out_backprop_t = GetRandomOutputTensor(dims); // assuming SAME padding + + Node* input_sizes = + test::graph::Constant(graph, input_sizes_t, "input_sizes"); + Node* filter = test::graph::Constant(graph, filter_t, "filter"); + Node* out_backprop = + test::graph::Constant(graph, out_backprop_t, "out_backprop"); + + Node* conv2d_bwd_input; + TF_CHECK_OK( + NodeBuilder(graph->NewName("conv_2d_bwd_input"), "Conv2DBackpropInput") + .Input(input_sizes) + .Input(filter) + .Input(out_backprop) + .Attr("T", DT_FLOAT) + .Attr("strides", {1, 1, 1, 1}) + .Attr("padding", "SAME") + .Finalize(graph, &conv2d_bwd_input)); + + return graph; +} + +#if defined(INTEL_MKL_DNN_ONLY) +static Graph* MklConv2DBwdInput(const Conv2DDimensions& dims) { + auto* graph = new Graph(OpRegistry::Global()); + + Tensor input_sizes_t = GetInputSizesTensor(dims); + Tensor filter_t = GetRandomFilterTensor(dims); + Tensor out_backprop_t = GetRandomOutputTensor(dims); // assuming SAME padding + + Node* input_sizes = + test::graph::Constant(graph, input_sizes_t, "input_sizes"); + Node* filter = test::graph::Constant(graph, filter_t, "filter"); + Node* out_backprop = + test::graph::Constant(graph, out_backprop_t, "out_backprop"); + + Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl"); + + Node* conv2d_bwd_input; + TF_CHECK_OK(NodeBuilder(graph->NewName("conv_2d_bwd_input"), + 
"_MklConv2DBackpropInput") + .Input(input_sizes) + .Input(filter) + .Input(out_backprop) + .Input(not_mkl_shape) + .Input(not_mkl_shape) + .Input(not_mkl_shape) + .Attr("T", DT_FLOAT) + .Attr("strides", {1, 1, 1, 1}) + .Attr("padding", "SAME") + .Attr("_kernel", "MklOp") + .Finalize(graph, &conv2d_bwd_input)); + + return graph; +} +#endif + +static Graph* DefaultConv2DBwdFilter(const Conv2DDimensions& dims) { + auto* graph = new Graph(OpRegistry::Global()); + + Tensor input_t = GetRandomInputTensor(dims); + Tensor filter_sizes_t = GetFilterSizesTensor(dims); + Tensor filter_t = GetRandomFilterTensor(dims); + Tensor out_backprop_t = GetRandomOutputTensor(dims); // assuming SAME padding + + Node* input = test::graph::Constant(graph, input_t, "input"); + Node* filter_sizes = + test::graph::Constant(graph, filter_sizes_t, "filter_sizes"); + Node* out_backprop = + test::graph::Constant(graph, out_backprop_t, "out_backprop"); + + Node* conv2d_bwd_filter; + TF_CHECK_OK( + NodeBuilder(graph->NewName("conv_2d_bwd_filter"), "Conv2DBackpropFilter") + .Input(input) + .Input(filter_sizes) + .Input(out_backprop) + .Attr("T", DT_FLOAT) + .Attr("strides", {1, 1, 1, 1}) + .Attr("padding", "SAME") + .Finalize(graph, &conv2d_bwd_filter)); + + return graph; +} + +#if defined(INTEL_MKL_DNN_ONLY) +static Graph* MklConv2DBwdFilter(const Conv2DDimensions& dims) { + Graph* graph = new Graph(OpRegistry::Global()); + + Tensor input_t = GetRandomInputTensor(dims); + Tensor filter_sizes_t = GetFilterSizesTensor(dims); + Tensor filter_t = GetRandomFilterTensor(dims); + Tensor out_backprop_t = GetRandomOutputTensor(dims); // assuming SAME padding + + Node* input = test::graph::Constant(graph, input_t, "input"); + Node* filter_sizes = + test::graph::Constant(graph, filter_sizes_t, "filter_sizes"); + Node* out_backprop = + test::graph::Constant(graph, out_backprop_t, "out_backprop"); + + Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl"); + + Node* conv2d_bwd_filter; + TF_CHECK_OK(NodeBuilder(graph->NewName("conv_2d_bwd_filter"), + "_MklConv2DBackpropFilter") + .Input(input) + .Input(filter_sizes) + .Input(out_backprop) + .Input(not_mkl_shape) + .Input(not_mkl_shape) + .Input(not_mkl_shape) + .Attr("T", DT_FLOAT) + .Attr("strides", {1, 1, 1, 1}) + .Attr("padding", "SAME") + .Attr("_kernel", "MklOp") + .Finalize(graph, &conv2d_bwd_filter)); + + return graph; +} +#endif + +// Macro arguments names: --------------------------------------------------- // +// N: batch size +// H: height +// W: width +// C: channels +// FC: filter count +// FH: filter height +// FW: filter width + +#define BM_CONCAT(a, b) a##b + +#define BM_NAME(p, type, N, H, W, C, FC, FH, FW) \ + BM_CONCAT(BM_##p##_##type##_in_##N##_##H##_##W##_##C, _f_##FC##_##FH##_##FW) + +// Flops computation in these benchmarks are the same as in +// eigen_benchmark_cpu_test.cc. 
+ +#define BM_Conv2DT(kind, N, H, W, C, FC, FH, FW, type, LABEL) \ + static void BM_NAME(Conv2D_##kind, type, N, H, W, C, FC, FH, \ + FW)(int iters) { \ + testing::SetLabel(LABEL); \ + \ + int64 num_computed_elements = (N) * (H) * (W) * (FC); \ + int64 flops_per_iter = num_computed_elements * ((C) * (FH) * (FW)); \ + testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter); \ + \ + Conv2DDimensions dims(N, H, W, C, FC, FW, FH); \ + test::Benchmark(#type, BM_CONCAT(kind, Conv2D)(dims)).Run(iters); \ + } \ + BENCHMARK(BM_NAME(Conv2D_##kind, type, N, H, W, C, FC, FH, FW)) + +#if defined(INTEL_MKL_DNN_ONLY) +#define BM_Conv2D(N, H, W, C, FC, FH, FW, type, LABEL) \ + BM_Conv2DT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \ + BM_Conv2DT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL); +#else +#define BM_Conv2D(N, H, W, C, FC, FH, FW, type, LABEL) \ + BM_Conv2DT(Default, N, H, W, C, FC, FH, FW, type, LABEL); +#endif + +#define BM_Conv2DBwdInputT(kind, N, H, W, C, FC, FH, FW, type, LABEL) \ + static void BM_NAME(Conv2DBwdInput_##kind, type, N, H, W, C, FC, FH, \ + FW)(int iters) { \ + testing::SetLabel(LABEL); \ + \ + int64 num_computed_elements = (N) * (H) * (W) * (C); \ + int64 flops_per_iter = num_computed_elements * ((C) * (FH) * (FW)); \ + testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter); \ + \ + Conv2DDimensions dims(N, H, W, C, FC, FW, FH); \ + test::Benchmark(#type, BM_CONCAT(kind, Conv2DBwdInput)(dims)).Run(iters); \ + } \ + BENCHMARK(BM_NAME(Conv2DBwdInput_##kind, type, N, H, W, C, FC, FH, FW)) + +#if defined(INTEL_MKL_DNN_ONLY) +#define BM_Conv2DBwdInput(N, H, W, C, FC, FH, FW, type, LABEL) \ + BM_Conv2DBwdInputT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \ + BM_Conv2DBwdInputT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL); +#else +#define BM_Conv2DBwdInput(N, H, W, C, FC, FH, FW, type, LABEL) \ + BM_Conv2DBwdInputT(Default, N, H, W, C, FC, FH, FW, type, LABEL); +#endif + +#define BM_Conv2DBwdFilterT(kind, N, H, W, C, FC, FH, FW, type, LABEL) \ + static void BM_NAME(Conv2DBwdFilter_##kind, type, N, H, W, C, FC, FH, \ + FW)(int iters) { \ + testing::SetLabel(LABEL); \ + \ + int64 num_computed_elements = (FH) * (FW) * (C) * (FC); \ + int64 flops_per_iter = num_computed_elements * ((N) * (H) * (W)); \ + testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter); \ + \ + Conv2DDimensions dims(N, H, W, C, FC, FW, FH); \ + test::Benchmark(#type, BM_CONCAT(kind, Conv2DBwdFilter)(dims)).Run(iters); \ + } \ + BENCHMARK(BM_NAME(Conv2DBwdFilter_##kind, type, N, H, W, C, FC, FH, FW)) + +#if defined(INTEL_MKL_DNN_ONLY) +#define BM_Conv2DBwdFilter(N, H, W, C, FC, FH, FW, type, LABEL) \ + BM_Conv2DBwdFilterT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \ + BM_Conv2DBwdFilterT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL); +#else +#define BM_Conv2DBwdFilter(N, H, W, C, FC, FH, FW, type, LABEL) \ + BM_Conv2DBwdFilterT(Default, N, H, W, C, FC, FH, FW, type, LABEL); +#endif + +// ImageNet Convolutions ---------------------------------------------------- // + +BM_Conv2D(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3"); +BM_Conv2D(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5"); +BM_Conv2D(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3"); +BM_Conv2D(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5"); +BM_Conv2D(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3"); +BM_Conv2D(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5"); +BM_Conv2D(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3"); + +BM_Conv2DBwdInput(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3"); +BM_Conv2DBwdInput(32, 28, 
28, 16, 32, 5, 5, cpu, "conv3a_00_5x5"); +BM_Conv2DBwdInput(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3"); +BM_Conv2DBwdInput(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5"); +BM_Conv2DBwdInput(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3"); +BM_Conv2DBwdInput(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5"); +BM_Conv2DBwdInput(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3"); + +BM_Conv2DBwdFilter(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3"); +BM_Conv2DBwdFilter(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5"); +BM_Conv2DBwdFilter(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3"); +BM_Conv2DBwdFilter(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5"); +BM_Conv2DBwdFilter(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3"); +BM_Conv2DBwdFilter(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5"); +BM_Conv2DBwdFilter(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3"); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc index 7a64788448..82dfece4a2 100644 --- a/tensorflow/core/kernels/multinomial_op.cc +++ b/tensorflow/core/kernels/multinomial_op.cc @@ -75,7 +75,7 @@ struct MultinomialFunctor<CPUDevice, T, OutputType> { // lambda. Since we want to let each worker have its own copy, we pass // "gen" by reference and explicitly do a copy assignment here. random::PhiloxRandom gen_copy = gen; - // Skip takes units of 128 bytes. +3 is so rounding doesn't lead to + // Skip takes units of 128 bits. +3 is so rounding doesn't lead to // us using the same state in different batches. gen_copy.Skip(start_row * (num_samples + 3) / 4); random::SimplePhilox simple_philox(&gen_copy); diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index 7bb403290d..fc1c9003aa 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -127,12 +127,12 @@ class PartitionedCallOp : public AsyncOpKernel { optimization_options.graph = &graph; optimization_options.flib_def = overlay_lib; optimization_options.device_set = &device_set; - Placer placer(graph.get(), &device_set); OP_REQUIRES_OK_ASYNC( ctx, OptimizationPassRegistry::Global()->RunGrouping( OptimizationPassRegistry::PRE_PLACEMENT, optimization_options), done); + Placer placer(graph.get(), &device_set); OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done); OP_REQUIRES_OK_ASYNC( ctx, @@ -210,7 +210,7 @@ class PartitionedCallOp : public AsyncOpKernel { TF_RETURN_IF_ERROR(node->attrs().Find("T", &attr_value)); DataType dtype = attr_value->type(); if (dtype == DT_RESOURCE) { - ResourceHandle handle = args[index].flat<ResourceHandle>()(0); + const ResourceHandle& handle = args[index].flat<ResourceHandle>()(0); node->set_assigned_device_name(handle.device()); } } diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h index 5fb1c92f94..272aa3b4f5 100644 --- a/tensorflow/core/kernels/queue_base.h +++ b/tensorflow/core/kernels/queue_base.h @@ -19,6 +19,7 @@ limitations under the License. #include <deque> #include <vector> +#include "absl/base/macros.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/queue_interface.h" #include "tensorflow/core/framework/tensor.h" @@ -82,6 +83,9 @@ class QueueBase : public QueueInterface { // NOTE(mrry): This method is deprecated. Use // `tensorflow::batch_util::CopySliceToElement()` defined in // "./batch_util.h" instead. 
+ ABSL_DEPRECATED( + "Use `tensorflow::batch_util::CopySliceToElement()` defined in " + "\"./batch_util.h\" instead.") static Status CopyElementToSlice(const Tensor& element, Tensor* parent, int64 index); diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc index c4d404259b..97ddc852f7 100644 --- a/tensorflow/core/kernels/queue_ops.cc +++ b/tensorflow/core/kernels/queue_ops.cc @@ -65,7 +65,7 @@ class FakeQueueOp : public OpKernel { } void Compute(OpKernelContext* context) override { - ResourceHandle ref = context->input(0).flat<ResourceHandle>()(0); + const ResourceHandle& ref = context->input(0).flat<ResourceHandle>()(0); handle_.AccessTensor(context)->flat<string>()(0) = ref.container(); handle_.AccessTensor(context)->flat<string>()(1) = ref.name(); context->set_output_ref(0, &mu_, handle_.AccessTensor(context)); diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc index e37232539f..04a53697c0 100644 --- a/tensorflow/core/kernels/random_op.cc +++ b/tensorflow/core/kernels/random_op.cc @@ -231,7 +231,13 @@ class RandomUniformIntOp : public OpKernel { errors::InvalidArgument("maxval must be 0-D, got shape ", maxval.shape().DebugString())); - // Verify that minval < maxval + // Allocate output, and exit early if possible + Tensor* output; + OP_REQUIRES_OK(ctx, AllocateOutputWithShape(ctx, shape, 0, &output)); + if (output->NumElements() == 0) return; + + // Verify that minval < maxval. This check intentionally happens after the + // early exit for empty output. Zero impossible things are fine. IntType lo = minval.scalar<IntType>()(); IntType hi = maxval.scalar<IntType>()(); OP_REQUIRES( @@ -243,8 +249,6 @@ class RandomUniformIntOp : public OpKernel { Distribution; Distribution dist(lo, hi); - Tensor* output; - OP_REQUIRES_OK(ctx, AllocateOutputWithShape(ctx, shape, 0, &output)); auto output_flat = output->flat<IntType>(); functor::FillPhiloxRandom<Device, Distribution>()( ctx, ctx->eigen_device<Device>(), diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index 88b3c2ac76..bb8254eaac 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -21,11 +21,11 @@ limitations under the License. 
#define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "external/cub_archive/cub/device/device_reduce.cuh" -#include "external/cub_archive/cub/device/device_segmented_reduce.cuh" -#include "external/cub_archive/cub/iterator/counting_input_iterator.cuh" -#include "external/cub_archive/cub/iterator/transform_input_iterator.cuh" -#include "external/cub_archive/cub/warp/warp_reduce.cuh" +#include "third_party/cub/device/device_reduce.cuh" +#include "third_party/cub/device/device_segmented_reduce.cuh" +#include "third_party/cub/iterator/counting_input_iterator.cuh" +#include "third_party/cub/iterator/transform_input_iterator.cuh" +#include "third_party/cub/warp/warp_reduce.cuh" #include "cuda/include/cuComplex.h" #include "tensorflow/core/kernels/reduction_ops.h" #include "tensorflow/core/lib/core/bits.h" diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc index 9cf953f4bf..8bfa44b2d0 100644 --- a/tensorflow/core/kernels/reduction_ops_max.cc +++ b/tensorflow/core/kernels/reduction_ops_max.cc @@ -50,6 +50,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); .TypeConstraint<int64>("Tidx") \ .HostMemory("reduction_indices"), \ ReductionOp<GPUDevice, type, int64, Eigen::internal::MaxReducer<type>>); + +REGISTER_GPU_KERNELS(Eigen::half); REGISTER_GPU_KERNELS(float); REGISTER_GPU_KERNELS(double); REGISTER_GPU_KERNELS(int64); diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index ebcfb673d1..26705a8d34 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -79,7 +79,7 @@ ReadVariableOp::ReadVariableOp(OpKernelConstruction* c) : OpKernel(c) { void ReadVariableOp::Compute(OpKernelContext* ctx) { Var* variable = nullptr; - ResourceHandle handle = HandleFromInput(ctx, 0); + const ResourceHandle& handle = HandleFromInput(ctx, 0); const auto status = LookupResource(ctx, handle, &variable); OP_REQUIRES(ctx, status.ok(), errors::FailedPrecondition( diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index 15a707a9c6..cded417986 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -64,7 +64,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim))); + " vs. ", input.dim_size(batch_dim), ")")); for (size_t d = 0; d < seq_lens_vec.size(); ++d) { OP_REQUIRES(context, seq_lens_vec[d] >= 0, @@ -91,7 +91,7 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim))); + " vs. 
", input.dim_size(batch_dim), ")")); } template <> @@ -127,6 +127,7 @@ class ReverseSequenceOp : public OpKernel { auto seq_lens_t = seq_lens.vec<Tlen>(); CheckErrors<Device, Tlen>(context, batch_dim_, seq_dim_); + if (!context->status().ok()) return; const int input_dims = input.dims(); diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index e0194605ce..2f8aede427 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -145,6 +145,7 @@ class ScatterNdUpdateOp : public OpKernel { if (dtype_ == DT_RESOURCE) { Var* v; OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v)); + core::ScopedUnref scoped_unref(v); mutex_lock m(*v->mu()); DoCompute(c); } else if (use_exclusive_lock_) { diff --git a/tensorflow/core/kernels/searchsorted_op.cc b/tensorflow/core/kernels/searchsorted_op.cc new file mode 100644 index 0000000000..dc627ac77a --- /dev/null +++ b/tensorflow/core/kernels/searchsorted_op.cc @@ -0,0 +1,249 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/searchsorted_op.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +template <typename T, typename OutType> +struct UpperBoundFunctor<CPUDevice, T, OutType> { + static Status Compute(OpKernelContext* context, + const typename TTypes<T, 1>::ConstTensor& sorted_inputs, + const typename TTypes<T, 1>::ConstTensor& values, + int batch_size, int num_inputs, int num_values, + typename TTypes<OutType, 1>::Tensor* output) { + // TODO(eriche): If anyone ever needs this to be faster, we can multithread. + for (int b = 0; b < batch_size; ++b) { + const T* sorted_inputs_ptr = sorted_inputs.data() + b * num_inputs; + OutType* output_ptr = output->data() + b * num_values; + for (int i = 0; i < num_values; ++i) { + output_ptr[i] = + std::upper_bound(sorted_inputs_ptr, sorted_inputs_ptr + num_inputs, + values(i + b * num_values)) - + sorted_inputs_ptr; + } + } + + return Status::OK(); + } +}; + +template <typename T, typename OutType> +struct LowerBoundFunctor<CPUDevice, T, OutType> { + static Status Compute(OpKernelContext* context, + const typename TTypes<T, 1>::ConstTensor& sorted_inputs, + const typename TTypes<T, 1>::ConstTensor& values, + int batch_size, int num_inputs, int num_values, + typename TTypes<OutType, 1>::Tensor* output) { + // TODO(eriche): If anyone ever needs this to be faster, we can multithread. 
+ for (int b = 0; b < batch_size; ++b) { + const T* sorted_inputs_ptr = sorted_inputs.data() + b * num_inputs; + OutType* output_ptr = output->data() + b * num_values; + for (int i = 0; i < num_values; ++i) { + output_ptr[i] = + std::lower_bound(sorted_inputs_ptr, sorted_inputs_ptr + num_inputs, + values(i + b * num_values)) - + sorted_inputs_ptr; + } + } + + return Status::OK(); + } +}; +} // namespace functor + +template <typename Device, typename T, typename OutType> +class UpperBoundOp : public OpKernel { + public: + explicit UpperBoundOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& sorted_inputs_t = ctx->input(0); + const Tensor& values_t = ctx->input(1); + + // must have same batch dim_size for both + OP_REQUIRES(ctx, sorted_inputs_t.dim_size(0) == values_t.dim_size(0), + Status(error::INVALID_ARGUMENT, + "Leading dim_size of both tensors must match.")); + + // this is required because we do indexing in int32 on the GPU + OP_REQUIRES(ctx, values_t.NumElements() < std::numeric_limits<int>::max(), + Status(error::INVALID_ARGUMENT, + "values tensor size must less than INT_MAX")); + + Tensor* output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, values_t.shape(), &output_t)); + + if (output_t->dtype() == DT_INT32) { + OP_REQUIRES(ctx, + FastBoundsCheck(sorted_inputs_t.dim_size(1), + std::numeric_limits<int>::max()), + errors::InvalidArgument("trailing dim_size must less than " + "INT_MAX for int32 output type, was ", + sorted_inputs_t.dim_size(1))); + } + + auto output = output_t->template flat<OutType>(); + const auto sorted_inputs = sorted_inputs_t.template flat<T>(); + const auto values = values_t.template flat<T>(); + OP_REQUIRES_OK( + ctx, functor::UpperBoundFunctor<Device, T, OutType>::Compute( + ctx, sorted_inputs, values, sorted_inputs_t.dim_size(0), + sorted_inputs_t.dim_size(1), values_t.dim_size(1), &output)); + } +}; + +template <typename Device, typename T, typename OutType> +class LowerBoundOp : public OpKernel { + public: + explicit LowerBoundOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& sorted_inputs_t = ctx->input(0); + const Tensor& values_t = ctx->input(1); + + // must have same batch dim_size for both + OP_REQUIRES(ctx, sorted_inputs_t.dim_size(0) == values_t.dim_size(0), + Status(error::INVALID_ARGUMENT, + "Leading dim_size of both tensors must match.")); + + // this is required because we do indexing in int32 on the GPU + OP_REQUIRES(ctx, values_t.NumElements() < std::numeric_limits<int>::max(), + Status(error::INVALID_ARGUMENT, + "values tensor size must less than INT_MAX")); + + Tensor* output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, values_t.shape(), &output_t)); + + if (output_t->dtype() == DT_INT32) { + OP_REQUIRES(ctx, + FastBoundsCheck(sorted_inputs_t.dim_size(1), + std::numeric_limits<int>::max()), + errors::InvalidArgument("trailing dim_size must less than " + "INT_MAX for int32 output type, was ", + sorted_inputs_t.dim_size(1))); + } + + auto output = output_t->template flat<OutType>(); + const auto sorted_inputs = sorted_inputs_t.template flat<T>(); + const auto values = values_t.template flat<T>(); + OP_REQUIRES_OK( + ctx, functor::LowerBoundFunctor<Device, T, OutType>::Compute( + ctx, sorted_inputs, values, sorted_inputs_t.dim_size(0), + sorted_inputs_t.dim_size(1), values_t.dim_size(1), &output)); + } +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("UpperBound") \ + .Device(DEVICE_CPU) 
\ + .TypeConstraint<type>("T") \ + .TypeConstraint<int32>("out_type"), \ + UpperBoundOp<CPUDevice, type, int32>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("UpperBound") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int64>("out_type"), \ + UpperBoundOp<CPUDevice, type, int64>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("UpperBound") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int32>("out_type"), \ + UpperBoundOp<GPUDevice, type, int32>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("UpperBound") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int64>("out_type"), \ + UpperBoundOp<GPUDevice, type, int64>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#endif // GOOGLE_CUDA + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("LowerBound") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int32>("out_type"), \ + LowerBoundOp<CPUDevice, type, int32>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("LowerBound") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int64>("out_type"), \ + LowerBoundOp<CPUDevice, type, int64>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("LowerBound") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int32>("out_type"), \ + LowerBoundOp<GPUDevice, type, int32>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("LowerBound") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int64>("out_type"), \ + LowerBoundOp<GPUDevice, type, int64>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#endif // GOOGLE_CUDA +} // namespace tensorflow diff --git a/tensorflow/core/kernels/searchsorted_op.h b/tensorflow/core/kernels/searchsorted_op.h new file mode 100644 index 0000000000..f075bf0fa2 --- /dev/null +++ b/tensorflow/core/kernels/searchsorted_op.h @@ -0,0 +1,52 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SEARCHSORTED_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SEARCHSORTED_OP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T, typename OutType> +struct UpperBoundFunctor { + // Searches for values in sorted_inputs and returns the greatest possible + // index where they maintain sorted order. + static Status Compute(OpKernelContext* context, + const typename TTypes<T, 1>::ConstTensor& sorted_inputs, + const typename TTypes<T, 1>::ConstTensor& values, + int batch_size, int num_inputs, int num_values, + typename TTypes<OutType, 1>::Tensor* output); +}; + +template <typename Device, typename T, typename OutType> +struct LowerBoundFunctor { + // Searches for values in sorted_inputs and returns the lowest possible + // index where they maintain sorted order. + static Status Compute(OpKernelContext* context, + const typename TTypes<T, 1>::ConstTensor& sorted_inputs, + const typename TTypes<T, 1>::ConstTensor& values, + int batch_size, int num_inputs, int num_values, + typename TTypes<OutType, 1>::Tensor* output); +}; +} // namespace functor + +} // end namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_SEARCHSORTED_OP_H_ diff --git a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc new file mode 100644 index 0000000000..263b5bf298 --- /dev/null +++ b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc @@ -0,0 +1,126 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/searchsorted_op.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +namespace { +template <typename T, typename OutType> +__global__ void UpperBoundKernel(const T* sorted_inputs, int batch_size, + int sorted_inputs_size, int values_size, + const T* values, OutType* outputs) { + CUDA_1D_KERNEL_LOOP(work_unit_id, values_size * batch_size) { + int bid = work_unit_id / values_size; + T value = values[work_unit_id]; + outputs[work_unit_id] = cuda_helper::upper_bound<T, OutType>( + sorted_inputs + bid * sorted_inputs_size, sorted_inputs_size, value); + } +} + +template <typename T, typename OutType> +__global__ void LowerBoundKernel(const T* sorted_inputs, int batch_size, + int sorted_inputs_size, int values_size, + const T* values, OutType* outputs) { + CUDA_1D_KERNEL_LOOP(work_unit_id, values_size * batch_size) { + int bid = work_unit_id / values_size; + T value = values[work_unit_id]; + outputs[work_unit_id] = cuda_helper::lower_bound<T, OutType>( + sorted_inputs + bid * sorted_inputs_size, sorted_inputs_size, value); + } +} +} // namespace + +namespace functor { +template <typename T, typename OutType> +struct UpperBoundFunctor<GPUDevice, T, OutType> { + static Status Compute(OpKernelContext* context, + const typename TTypes<T, 1>::ConstTensor& sorted_inputs, + const typename TTypes<T, 1>::ConstTensor& values, + int batch_size, int num_inputs, int num_values, + typename TTypes<OutType, 1>::Tensor* output) { + const cudaStream_t& stream = GetCudaStream(context); + CudaLaunchConfig config = + GetCudaLaunchConfig(values.size(), context->eigen_gpu_device()); + + UpperBoundKernel<T> + <<<config.block_count, config.thread_per_block, 0, stream>>>( + sorted_inputs.data(), batch_size, num_inputs, num_values, + values.data(), output->data()); + + return Status::OK(); + } +}; + +template <typename T, typename OutType> +struct LowerBoundFunctor<GPUDevice, T, OutType> { + static Status Compute(OpKernelContext* context, + const typename TTypes<T, 1>::ConstTensor& sorted_inputs, + const typename TTypes<T, 1>::ConstTensor& values, + int batch_size, int num_inputs, int num_values, + typename TTypes<OutType, 1>::Tensor* output) { + const cudaStream_t& stream = GetCudaStream(context); + CudaLaunchConfig config = + GetCudaLaunchConfig(values.size(), context->eigen_gpu_device()); + + LowerBoundKernel<T> + <<<config.block_count, config.thread_per_block, 0, stream>>>( + sorted_inputs.data(), batch_size, num_inputs, num_values, + values.data(), output->data()); + + return Status::OK(); + } +}; +} // namespace functor + +#define REGISTER_GPU_SPEC(type) \ + template struct functor::UpperBoundFunctor<GPUDevice, type, int32>; + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_SPEC); +#undef REGISTER_GPU_SPEC + +#define REGISTER_GPU_SPEC(type) \ + template struct functor::UpperBoundFunctor<GPUDevice, type, int64>; + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_SPEC); +#undef REGISTER_GPU_SPEC + +#define REGISTER_GPU_SPEC(type) \ + template struct functor::LowerBoundFunctor<GPUDevice, type, int32>; + 
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_SPEC); +#undef REGISTER_GPU_SPEC + +#define REGISTER_GPU_SPEC(type) \ + template struct functor::LowerBoundFunctor<GPUDevice, type, int64>; + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_SPEC); +#undef REGISTER_GPU_SPEC +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc index 9cd590ae61..30cb1e0a7f 100644 --- a/tensorflow/core/kernels/shape_op_test.cc +++ b/tensorflow/core/kernels/shape_op_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/abi.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -60,8 +61,7 @@ Status GetShapeFromKnownVecSize(const KnownVecSize& ks, TensorShape* s) { REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE"); -REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE", - GetShapeFromKnownVecSize); +REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, GetShapeFromKnownVecSize); static void ExpectHasError(const Status& s, StringPiece substr) { EXPECT_TRUE(str_util::StrContains(s.ToString(), substr)) @@ -94,9 +94,9 @@ TEST_F(ShapeOpTest, Simple) { Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs); EXPECT_FALSE(s.ok()); ExpectHasError( - s, - "No unary variant shape function found for Variant type_name: " - "NO KNOWN SHAPE"); + s, strings::StrCat( + "No unary variant shape function found for Variant type_index: ", + port::MaybeAbiDemangle(MakeTypeIndex<NoKnownShape>().name()))); } { diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc index 393818730b..a4a59dbcbc 100644 --- a/tensorflow/core/kernels/split_lib_gpu.cu.cc +++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc @@ -54,6 +54,7 @@ void SplitCustom<Device, T>::operator()( TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); TF_CALL_complex64(DEFINE_GPU_KERNELS); TF_CALL_complex128(DEFINE_GPU_KERNELS); +TF_CALL_int64(DEFINE_GPU_KERNELS); TF_CALL_bfloat16(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc index 7cc3c532c9..11db72bfa3 100644 --- a/tensorflow/core/kernels/split_op.cc +++ b/tensorflow/core/kernels/split_op.cc @@ -49,7 +49,12 @@ class SplitOpBase : public OpKernel { void ComputeEasyCases(OpKernelContext* context, bool* done) { const Tensor& input = context->input(1); const TensorShape& input_shape = input.shape(); - const int32 split_dim_orig = context->input(0).flat<int32>()(0); + const Tensor& split_dim_tensor = context->input(0); + OP_REQUIRES( + context, split_dim_tensor.shape().dims() == 0, + errors::InvalidArgument("split_dim must be a scalar but has rank ", + split_dim_tensor.shape().dims())); + const int32 split_dim_orig = split_dim_tensor.flat<int32>()(0); const int32 split_dim = split_dim_orig < 0 ? 
split_dim_orig + input.dims() : split_dim_orig; const int32 num_split = num_outputs(); diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc index 65296f61fd..add4afafc9 100644 --- a/tensorflow/core/kernels/stack_ops.cc +++ b/tensorflow/core/kernels/stack_ops.cc @@ -131,10 +131,8 @@ class Stack : public ResourceBase { }; Status GetStack(OpKernelContext* ctx, Stack** stack) { - string key; if (ctx->input_dtype(0) == DT_RESOURCE) { - auto resource = ctx->input(0).flat<ResourceHandle>()(0); - key = resource.name(); + return LookupResource(ctx, HandleFromInput(ctx, 0), stack); } else { Tensor Tstack_handle = ctx->mutable_input(0, false); if (Tstack_handle.NumElements() != 2) { @@ -144,18 +142,18 @@ Status GetStack(OpKernelContext* ctx, Stack** stack) { } const string& container = Tstack_handle.flat<string>()(0); const string& stack_name = Tstack_handle.flat<string>()(1); - key = strings::StrCat(container, stack_name); - } - ResourceMgr* rm = ctx->resource_manager(); - if (rm == nullptr) { - return errors::Internal("No resource manager."); - } - auto* step_container = ctx->step_container(); - if (step_container == nullptr) { - return errors::Internal("No step container."); + string key = strings::StrCat(container, stack_name); + ResourceMgr* rm = ctx->resource_manager(); + if (rm == nullptr) { + return errors::Internal("No resource manager."); + } + auto* step_container = ctx->step_container(); + if (step_container == nullptr) { + return errors::Internal("No step container."); + } + TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack)); + return Status::OK(); } - TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack)); - return Status::OK(); } std::atomic<int64> Stack::stack_counter{0}; diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 7b537fef5b..f0575de4d9 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -306,6 +306,7 @@ class StridedSliceAssignOp : public OpKernel { Var* v; OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &v)); + core::ScopedUnref scoped_unref(v); mutex_lock ml(*v->mu()); OP_REQUIRES_OK(context, PrepareToUpdateVariable<Device, T>(context, v->tensor())); diff --git a/tensorflow/core/kernels/string_format_op.cc b/tensorflow/core/kernels/string_format_op.cc new file mode 100644 index 0000000000..e4a1887f8d --- /dev/null +++ b/tensorflow/core/kernels/string_format_op.cc @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include <iostream> +#include "absl/strings/str_split.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +class StringFormatOp : public OpKernel { + public: + explicit StringFormatOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string template_; + OP_REQUIRES_OK(ctx, ctx->GetAttr("template", &template_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("placeholder", &placeholder_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("summarize", &summarize_)); + + split_template_ = absl::StrSplit(template_, placeholder_); + int64 num_placeholders = split_template_.size() - 1; + OP_REQUIRES(ctx, ctx->num_inputs() == num_placeholders, + errors::InvalidArgument(strings::StrCat( + "num placeholders in template and num inputs must match: ", + num_placeholders, " vs. ", ctx->num_inputs()))); + } + + void Compute(OpKernelContext* ctx) override { + Tensor* formatted_string = nullptr; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({}), &formatted_string)); + + string msg; + strings::StrAppend(&msg, split_template_[0].c_str()); + for (int i = 0; i < ctx->num_inputs(); ++i) { + strings::StrAppend(&msg, ctx->input(i).SummarizeValue(summarize_, true)); + strings::StrAppend(&msg, split_template_[i + 1].c_str()); + } + + formatted_string->scalar<string>()() = msg; + } + + private: + int32 summarize_ = 0; + string placeholder_; + std::vector<std::string> split_template_; +}; + +REGISTER_KERNEL_BUILDER(Name("StringFormat").Device(DEVICE_CPU), + StringFormatOp); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/string_format_op_test.cc b/tensorflow/core/kernels/string_format_op_test.cc new file mode 100644 index 0000000000..13130a5797 --- /dev/null +++ b/tensorflow/core/kernels/string_format_op_test.cc @@ -0,0 +1,66 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { +namespace { + +class StringFormatGraphTest : public OpsTestBase { + protected: + Status Init(int num_inputs, DataType input_type, + const string& template_ = "%s", const string& placeholder = "%s", + int summarize = 3) { + TF_CHECK_OK(NodeDefBuilder("op", "StringFormat") + .Input(FakeInput(num_inputs, input_type)) + .Attr("template", template_) + .Attr("placeholder", placeholder) + .Attr("summarize", summarize) + .Finalize(node_def())); + return InitOp(); + } +}; + +TEST_F(StringFormatGraphTest, Int32Success_7) { + TF_ASSERT_OK(Init(1, DT_INT32, "First tensor: %s")); + + AddInputFromArray<int32>(TensorShape({7}), {1, 2, 3, 4, 5, 6, 7}); + TF_ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_STRING, TensorShape({})); + test::FillValues<string>(&expected, {"First tensor: [1 2 3 ... 5 6 7]"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(StringFormatGraphTest, Int32Success_3_3) { + TF_ASSERT_OK(Init(1, DT_INT32, "First tensor: %s", "%s", 1)); + + AddInputFromArray<int32>(TensorShape({3, 3}), {1, 2, 3, 4, 5, 6, 7, 8, 9}); + TF_ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_STRING, TensorShape({})); + test::FillValues<string>(&expected, {"First tensor: [[1 ... 3]\n ..." + "\n [7 ... 9]]"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +} // end namespace +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/string_length_op.cc b/tensorflow/core/kernels/string_length_op.cc index a6829b29d9..435a7abdca 100644 --- a/tensorflow/core/kernels/string_length_op.cc +++ b/tensorflow/core/kernels/string_length_op.cc @@ -14,13 +14,18 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/string_util.h" namespace tensorflow { namespace { class StringLengthOp : public OpKernel { public: - using OpKernel::OpKernel; + explicit StringLengthOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string unit; + OP_REQUIRES_OK(ctx, ctx->GetAttr("unit", &unit)); + OP_REQUIRES_OK(ctx, ParseCharUnit(unit, &unit_)); + } void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); @@ -32,10 +37,22 @@ class StringLengthOp : public OpKernel { auto src = input.flat<string>(); auto dst = output->flat<int32>(); - for (int n = 0; n < src.size(); ++n) { - dst(n) = src(n).size(); + switch (unit_) { + case CharUnit::BYTE: + for (int n = 0; n < src.size(); ++n) { + dst(n) = src(n).size(); + } + break; + case CharUnit::UTF8_CHAR: + for (int n = 0; n < src.size(); ++n) { + dst(n) = UTF8StrLen(src(n)); + } + break; } } + + private: + CharUnit unit_ = CharUnit::BYTE; }; REGISTER_KERNEL_BUILDER(Name("StringLength").Device(DEVICE_CPU), diff --git a/tensorflow/core/kernels/string_util.cc b/tensorflow/core/kernels/string_util.cc new file mode 100644 index 0000000000..3a9803a052 --- /dev/null +++ b/tensorflow/core/kernels/string_util.cc @@ -0,0 +1,63 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/kernels/string_util.h" + +#include "tensorflow/core/lib/core/errors.h" + +namespace { +inline bool IsTrailByte(char x) { return static_cast<signed char>(x) < -0x40; } +} // namespace + +namespace tensorflow { + +// Sets encoding value based on str. +Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding) { + if (str == "UTF8") { + *encoding = UnicodeEncoding::UTF8; + } else { + return errors::InvalidArgument(strings::StrCat( + "Invalid encoding \"", str, "\": Should be one of: UTF8")); + } + return Status::OK(); +} + +// Sets unit value based on str. +Status ParseCharUnit(const string& str, CharUnit* unit) { + if (str == "BYTE") { + *unit = CharUnit::BYTE; + } else if (str == "UTF8_CHAR") { + *unit = CharUnit::UTF8_CHAR; + } else { + return errors::InvalidArgument(strings::StrCat( + "Invalid unit \"", str, "\": Should be one of: BYTE, UTF8_CHAR")); + } + return Status::OK(); +} + +// Returns the number of Unicode characters in a UTF-8 string. +// Result may be incorrect if the input string is not valid UTF-8. +int32 UTF8StrLen(const string& string) { + const int32 byte_size = string.size(); + const char* const end = string.data() + byte_size; + const char* ptr = string.data(); + int32 skipped_count = 0; + while (ptr < end) { + skipped_count += IsTrailByte(*ptr++) ?
1 : 0; + } + const int32 result = byte_size - skipped_count; + return result; +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/string_util.h b/tensorflow/core/kernels/string_util.h new file mode 100644 index 0000000000..390cf57702 --- /dev/null +++ b/tensorflow/core/kernels/string_util.h @@ -0,0 +1,45 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Enumeration for unicode encodings. Used by ops such as +// tf.strings.unicode_encode and tf.strings.unicode_decode. +// TODO(edloper): Add support for: +// UTF16, UTF32, UTF16BE, UTF32BE, UTF16LE, UTF32LE +enum class UnicodeEncoding { UTF8 }; + +// Enumeration for character units. Used by string ops such as +// tf.strings.length and tf.substr. +// TODO(edloper): Add support for: UTF32_CHAR, etc. +enum class CharUnit { BYTE, UTF8_CHAR }; + +// Sets `encoding` based on `str`. +Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding); + +// Sets `unit` value based on `str`. +Status ParseCharUnit(const string& str, CharUnit* unit); + +// Returns the number of Unicode characters in a UTF-8 string. +// Result may be incorrect if the input string is not valid UTF-8. +int32 UTF8StrLen(const string& string); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc index 22e45918a0..07f1d6e767 100644 --- a/tensorflow/core/kernels/substr_op.cc +++ b/tensorflow/core/kernels/substr_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include <cstddef> +#include <cstdlib> #include <string> #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" @@ -25,6 +27,8 @@ limitations under the License.
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/bcast.h" namespace tensorflow { @@ -64,26 +68,28 @@ class SubstrOp : public OpKernel { const T len = tensorflow::internal::SubtleMustCopy(len_tensor.scalar<T>()()); for (size_t i = 0; i < input_tensor.NumElements(); ++i) { - string in = input(i); + StringPiece in(input(i)); OP_REQUIRES( - context, FastBoundsCheck(pos, in.size() + 1), + context, FastBoundsCheck(std::abs(pos), in.size() + 1), errors::InvalidArgument("pos ", pos, " out of range for string", "b'", in, "' at index ", i)); - output(i) = in.substr(pos, len); + StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len); + output(i).assign(sub_in.data(), sub_in.size()); } } else { // Perform Op element-wise with tensor pos/len auto pos_flat = pos_tensor.flat<T>(); auto len_flat = len_tensor.flat<T>(); for (size_t i = 0; i < input_tensor.NumElements(); ++i) { - string in = input(i); + StringPiece in(input(i)); const T pos = tensorflow::internal::SubtleMustCopy(pos_flat(i)); const T len = tensorflow::internal::SubtleMustCopy(len_flat(i)); OP_REQUIRES( - context, FastBoundsCheck(pos, in.size() + 1), + context, FastBoundsCheck(std::abs(pos), in.size() + 1), errors::InvalidArgument("pos ", pos, " out of range for string", "b'", in, "' at index ", i)); - output(i) = in.substr(pos, len); + StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len); + output(i).assign(sub_in.data(), sub_in.size()); } } } else { @@ -142,14 +148,16 @@ class SubstrOp : public OpKernel { // Iterate through broadcasted tensors and perform substr for (int i = 0; i < output_shape.dim_size(0); ++i) { - string in = input_bcast(i); + StringPiece in(input_bcast(i)); const T pos = tensorflow::internal::SubtleMustCopy(pos_bcast(i)); const T len = tensorflow::internal::SubtleMustCopy(len_bcast(i)); OP_REQUIRES( - context, FastBoundsCheck(pos, input_bcast(i).size() + 1), + context, + FastBoundsCheck(std::abs(pos), input_bcast(i).size() + 1), errors::InvalidArgument("pos ", pos, " out of range for string", "b'", in, "' at index ", i)); - output(i) = in.substr(pos, len); + StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len); + output(i).assign(sub_in.data(), sub_in.size()); } break; } @@ -192,16 +200,18 @@ class SubstrOp : public OpKernel { // Iterate through broadcasted tensors and perform substr for (int i = 0; i < output_shape.dim_size(0); ++i) { for (int j = 0; j < output_shape.dim_size(1); ++j) { - string in = input_bcast(i, j); + StringPiece in(input_bcast(i, j)); const T pos = tensorflow::internal::SubtleMustCopy(pos_bcast(i, j)); const T len = tensorflow::internal::SubtleMustCopy(len_bcast(i, j)); - OP_REQUIRES(context, FastBoundsCheck(pos, in.size() + 1), - errors::InvalidArgument( - "pos ", pos, " out of range for ", "string b'", - in, "' at index (", i, ", ", j, ")")); - output(i, j) = in.substr(pos, len); + OP_REQUIRES( + context, FastBoundsCheck(std::abs(pos), in.size() + 1), + errors::InvalidArgument("pos ", pos, " out of range for ", + "string b'", in, "' at index (", i, + ", ", j, ")")); + StringPiece sub_in = in.substr(AdjustedPosIndex(pos, in), len); + output(i, j).assign(sub_in.data(), sub_in.size()); } } break; @@ -213,6 +223,16 @@ class SubstrOp : public OpKernel { } } } + + private: + // This adjusts the requested position. Note it does not perform any bound + // checks. 
+ T AdjustedPosIndex(const T pos_requested, const StringPiece s) { + if (pos_requested < 0) { + return s.size() + pos_requested; + } + return pos_requested; + } }; #define REGISTER_SUBSTR(type) \ diff --git a/tensorflow/core/kernels/substr_op_test.cc b/tensorflow/core/kernels/substr_op_test.cc new file mode 100644 index 0000000000..2e07050260 --- /dev/null +++ b/tensorflow/core/kernels/substr_op_test.cc @@ -0,0 +1,105 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include <string> + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Test data from the TensorFlow README.md. 
+const char* lines[] = { + "**TensorFlow** is an open source software library for numerical " + "computation using data flow graphs.", + "The graph nodes represent mathematical operations, while the graph edges " + "represent the multidimensional data arrays (tensors) that flow between " + "them.", + "This flexible architecture enables you to deploy computation to one or " + "more CPUs or GPUs in a desktop, server, or mobile device without " + "rewriting code.", + "TensorFlow also includes " + "[TensorBoard](https://www.tensorflow.org/guide/" + "summaries_and_tensorboard), a data visualization toolkit.", + "TensorFlow was originally developed by researchers and engineers working " + "on the Google Brain team within Google's Machine Intelligence Research " + "organization for the purposes of conducting machine learning and deep " + "neural networks research.", + "The system is general enough to be applicable in a wide variety of other " + "domains, as well.", + "TensorFlow provides stable Python API and C APIs as well as without API " + "backwards compatibility guarantee like C++, Go, Java, JavaScript and " + "Swift."}; + +Tensor GetTestTensor(int batch) { + const int sz = TF_ARRAYSIZE(lines); + Tensor t(DT_STRING, {batch}); + auto s = t.flat<string>(); + for (int i = 0; i < batch; ++i) { + s(i) = lines[i % sz]; + } + return t; +} + +Graph* SetupSubstrGraph(const Tensor& input, const int32 pos, const int32 len) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor position(DT_INT32, TensorShape({})); + position.flat<int32>().setConstant(pos); + Tensor length(DT_INT32, TensorShape({})); + length.flat<int32>().setConstant(len); + + TF_CHECK_OK(NodeBuilder("substr_op", "Substr") + .Input(test::graph::Constant(g, input)) + .Input(test::graph::Constant(g, position)) + .Input(test::graph::Constant(g, length)) + .Finalize(g, nullptr /* node */)); + return g; +} + +void BM_Substr(int iters, int batch_size) { + testing::StopTiming(); + testing::ItemsProcessed(static_cast<int64>(iters)); + testing::UseRealTime(); + Tensor input = GetTestTensor(batch_size); + Graph* g = SetupSubstrGraph(input, 3, 30); + testing::StartTiming(); + test::Benchmark("cpu", g).Run(iters); +} + +BENCHMARK(BM_Substr)->Arg(1)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128)->Arg( + 256); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/tensor_array.cc b/tensorflow/core/kernels/tensor_array.cc index 765467bc1e..0e6c0ddccc 100644 --- a/tensorflow/core/kernels/tensor_array.cc +++ b/tensorflow/core/kernels/tensor_array.cc @@ -62,7 +62,8 @@ TF_CALL_complex128(TENSOR_ARRAY_WRITE_OR_ADD_GPU); } #define TENSOR_ARRAY_SET_ZERO_CPU(T) TENSOR_ARRAY_SET_ZERO(CPUDevice, T) -TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_CPU) +TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_CPU); +TF_CALL_bool(TENSOR_ARRAY_SET_ZERO_CPU); #undef TENSOR_ARRAY_SET_ZERO_CPU #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h index e8dc4fad21..384a63e945 100644 --- a/tensorflow/core/kernels/tensor_array.h +++ b/tensorflow/core/kernels/tensor_array.h @@ -81,7 +81,8 @@ Status TensorSetZero(OpKernelContext* ctx, Tensor* value) { Status TensorSetZero<Device, T>(OpKernelContext * ctx, Tensor * value); #define TENSOR_ARRAY_SET_ZERO_CPU(T) TENSOR_ARRAY_SET_ZERO(CPUDevice, T) -TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_CPU) +TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_CPU); +TF_CALL_bool(TENSOR_ARRAY_SET_ZERO_CPU); #undef TENSOR_ARRAY_SET_ZERO_CPU #if GOOGLE_CUDA diff --git 
a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc index 2ec2651c04..a97a71b344 100644 --- a/tensorflow/core/kernels/tensor_array_ops.cc +++ b/tensorflow/core/kernels/tensor_array_ops.cc @@ -259,6 +259,7 @@ REGISTER_KERNEL_BUILDER(Name("TensorArrayV3").Device(DEVICE_CPU), TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); TF_CALL_complex64(REGISTER_GPU); TF_CALL_complex128(REGISTER_GPU); +TF_CALL_int64(REGISTER_GPU); REGISTER_GPU(bfloat16); #undef REGISTER_GPU @@ -290,7 +291,7 @@ class TensorArrayGradOp : public TensorArrayCreationOp { } } else { container = "_tensor_arrays"; - auto resource = ctx->input(0).flat<ResourceHandle>()(0); + const auto& resource = ctx->input(0).flat<ResourceHandle>()(0); if (StringPiece(resource.name()).substr(0, container.size()) != container) { return errors::InvalidArgument("Wrong input container. ", @@ -576,6 +577,7 @@ TF_CALL_ALL_TYPES(REGISTER_READ) TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); TF_CALL_complex64(REGISTER_GPU); TF_CALL_complex128(REGISTER_GPU); +TF_CALL_int64(REGISTER_GPU); REGISTER_GPU(bfloat16); #undef REGISTER_GPU @@ -1218,6 +1220,7 @@ TF_CALL_ALL_TYPES(REGISTER_SCATTER_AND_UNPACK); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); TF_CALL_complex64(REGISTER_GPU); TF_CALL_complex128(REGISTER_GPU); +TF_CALL_int64(REGISTER_GPU); #undef REGISTER_GPU #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/topk_op_gpu.cu.cc b/tensorflow/core/kernels/topk_op_gpu.cu.cc index ca296d5aa0..2fbe1fe7cb 100644 --- a/tensorflow/core/kernels/topk_op_gpu.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu.cu.cc @@ -20,9 +20,9 @@ limitations under the License. #include <cmath> #include <vector> #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "external/cub_archive/cub/device/device_segmented_radix_sort.cuh" -#include "external/cub_archive/cub/iterator/counting_input_iterator.cuh" -#include "external/cub_archive/cub/iterator/transform_input_iterator.cuh" +#include "third_party/cub/device/device_segmented_radix_sort.cuh" +#include "third_party/cub/iterator/counting_input_iterator.cuh" +#include "third_party/cub/iterator/transform_input_iterator.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc index d3c4f62071..83b83fcdb9 100644 --- a/tensorflow/core/kernels/training_op_helpers.cc +++ b/tensorflow/core/kernels/training_op_helpers.cc @@ -21,6 +21,7 @@ mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input) { if (ctx->input_dtype(input) == DT_RESOURCE) { Var* var; if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) { + core::ScopedUnref scoped_unref(var); return var->mu(); } else { ctx->CtxFailureWithWarning( diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc index 62e814ff77..8d839ba85a 100644 --- a/tensorflow/core/kernels/unravel_index_op.cc +++ b/tensorflow/core/kernels/unravel_index_op.cc @@ -97,10 +97,12 @@ class UnravelIndexOp : public OpKernel { auto output = output_tensor->matrix<Tidx>(); - Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}}; - Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()}); - Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}}; - Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1}); + Eigen::array<Eigen::Index, 2> reshape{{dims_tensor.NumElements(), 
1}}; + Eigen::array<Eigen::Index, 2> bcast({1, indices_tensor.NumElements()}); + Eigen::array<Eigen::Index, 2> indices_reshape{ + {1, indices_tensor.NumElements()}}; + Eigen::array<Eigen::Index, 2> indices_bcast( + {dims_tensor.NumElements(), 1}); output = indices_tensor.vec<Tidx>() .reshape(indices_reshape) diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h index 8879d9dd4c..2255597651 100644 --- a/tensorflow/core/kernels/where_op_gpu.cu.h +++ b/tensorflow/core/kernels/where_op_gpu.cu.h @@ -21,10 +21,10 @@ limitations under the License. #define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "external/cub_archive/cub/device/device_reduce.cuh" -#include "external/cub_archive/cub/device/device_select.cuh" -#include "external/cub_archive/cub/iterator/counting_input_iterator.cuh" -#include "external/cub_archive/cub/iterator/transform_input_iterator.cuh" +#include "third_party/cub/device/device_reduce.cuh" +#include "third_party/cub/device/device_select.cuh" +#include "third_party/cub/iterator/counting_input_iterator.cuh" +#include "third_party/cub/iterator/transform_input_iterator.cuh" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/kernels/bounds_check.h"
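
Editor's note (illustrative, not part of the patch): the SearchSorted functors added earlier in this diff compute, for each batch row of a sorted input, the lower- or upper-bound insertion index of every query value. The following standalone CPU sketch shows that contract using std::lower_bound/std::upper_bound; the function name and layout here are hypothetical, chosen only to mirror the batch_size/num_inputs/num_values arguments of the functors above.

// Reference sketch of batched lower/upper bound, CPU only.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// For each value in `values`, returns the smallest (lower bound) or largest
// (upper bound) index at which it could be inserted into the corresponding
// row of `sorted_inputs` while keeping that row sorted. Rows are laid out
// contiguously: row b occupies [b * num_inputs, (b + 1) * num_inputs).
std::vector<int32_t> BatchedBound(const std::vector<float>& sorted_inputs,
                                  const std::vector<float>& values,
                                  int batch_size, int num_inputs,
                                  int num_values, bool upper) {
  std::vector<int32_t> out(batch_size * num_values);
  for (int b = 0; b < batch_size; ++b) {
    const float* row = sorted_inputs.data() + b * num_inputs;
    for (int v = 0; v < num_values; ++v) {
      const float x = values[b * num_values + v];
      const float* pos = upper ? std::upper_bound(row, row + num_inputs, x)
                               : std::lower_bound(row, row + num_inputs, x);
      out[b * num_values + v] = static_cast<int32_t>(pos - row);
    }
  }
  return out;
}

int main() {
  // One batch row [1, 3, 5, 7]; queries [3, 6].
  std::vector<float> sorted = {1, 3, 5, 7};
  std::vector<float> values = {3, 6};
  auto lower = BatchedBound(sorted, values, 1, 4, 2, /*upper=*/false);
  auto upper = BatchedBound(sorted, values, 1, 4, 2, /*upper=*/true);
  std::cout << lower[0] << " " << lower[1] << "\n";  // prints: 1 3
  std::cout << upper[0] << " " << upper[1] << "\n";  // prints: 2 3
  return 0;
}

For the row [1, 3, 5, 7] with queries [3, 6], the lower bounds are [1, 3] and the upper bounds are [2, 3]; these are the indices the new GPU kernels are expected to produce via cuda_helper::lower_bound/upper_bound.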