diff options
Diffstat (limited to 'tensorflow/core')
62 files changed, 1668 insertions, 190 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c25aac3acf..7fa0b79766 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -454,6 +454,7 @@ tf_cuda_library( "framework/reader_interface.h", "framework/reader_op_kernel.h", "framework/register_types.h", + "framework/register_types_traits.h", "framework/resource_mgr.h", "framework/resource_op_kernel.h", "framework/selective_registration.h", @@ -611,6 +612,7 @@ tf_gen_op_libs( "list_ops", "lookup_ops", "logging_ops", + "manip_ops", "math_ops", "nn_ops", "no_op", @@ -693,6 +695,7 @@ cc_library( ":list_ops_op_lib", ":logging_ops_op_lib", ":lookup_ops_op_lib", + ":manip_ops_op_lib", ":math_ops_op_lib", ":nn_ops_op_lib", ":no_op_op_lib", @@ -831,6 +834,7 @@ cc_library( "//tensorflow/core/kernels:list_kernels", "//tensorflow/core/kernels:lookup", "//tensorflow/core/kernels:logging", + "//tensorflow/core/kernels:manip", "//tensorflow/core/kernels:math", "//tensorflow/core/kernels:multinomial_op", "//tensorflow/core/kernels:nn", @@ -1153,6 +1157,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], alwayslink = 1, diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt index 8da76684e5..97fd39f647 100644 --- a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt @@ -16,5 +16,6 @@ END description: <<END Note that this routine only supports wildcard characters in the basename portion of the pattern, not in the directory portion. +Note also that the order of filenames returned can be non-deterministic. 
END } diff --git a/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt new file mode 100644 index 0000000000..b308ad1f9d --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt @@ -0,0 +1,52 @@ +op { + graph_op_name: "Roll" + in_arg { + name: "shift" + description: <<END +Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which +elements are shifted positively (towards larger indices) along the dimension +specified by `axis[i]`. Negative shifts will roll the elements in the opposite +direction. +END + } + in_arg { + name: "axis" + description: <<END +Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension along which the +shift `shift[i]` should occur. If the same axis is referenced more than once, +the total shift for that axis will be the sum of all the shifts that belong to +that axis. +END + } + out_arg { + name: "output" + description: <<END +Has the same shape and size as the input. The elements are shifted +positively (towards larger indices) by the offsets of `shift` along the +dimensions of `axis`. +END + } + summary: "Rolls the elements of a tensor along an axis." + description: <<END +The elements are shifted positively (towards larger indices) by the offset of +`shift` along the dimension of `axis`. Negative `shift` values will shift +elements in the opposite direction. Elements that roll past the last position +will wrap around to the first and vice versa. Multiple shifts along multiple +axes may be specified. 
+ +For example: + +``` +# 't' is [0, 1, 2, 3, 4] +roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2] + +# shifting along multiple dimensions +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]] + +# shifting along the same axis multiple times +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]] +``` +END +} diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt new file mode 100644 index 0000000000..97c380700a --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt @@ -0,0 +1,32 @@ +op { + graph_op_name: "UnravelIndex" + in_arg { + name: "indices" + description: <<END +A 0-D or 1-D `int` Tensor whose elements are indices into the +flattened version of an array of dimensions dims. +END + } + in_arg { + name: "dims" + description: <<END +A 1-D `int` Tensor. The shape of the array to use for unraveling +indices. +END + } + out_arg { + name: "output" + description: <<END +A 2-D (or 1-D if indices is 0-D) tensor where each row has the +same shape as the indices array. +END + } + summary: "Converts a flat index or array of flat indices into a tuple of" + description: <<END +coordinate arrays. + +@compatibility(numpy) +Equivalent to np.unravel_index +@end_compatibility +END +} diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 04b5541863..a9485a835e 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -762,7 +762,8 @@ int64 MinSystemMemory(int64 available_memory) { // is necessary. 
min_system_memory *= 2; #endif -#if defined(NVIDIA_TEGRA) + +#if defined(ANDROID_TEGRA) // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM // and Video RAM min_system_memory = 1 << 30; diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index f4ee841032..9e152aa082 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -145,6 +145,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:worker_proto_cc", ], ) diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index dcc25e4426..878a1398c9 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -1448,6 +1448,7 @@ Status MasterSession::DoPartialRun(CallOptions* opts, const auto count = run_state->count; pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE; + pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE; pss.report_tensor_allocations_upon_oom = req.options().report_tensor_allocations_upon_oom(); @@ -1610,6 +1611,7 @@ Status MasterSession::DoRunWithLocalExecution( TRACEPRINTF("stepid %llu", step_id); pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE; + pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE; pss.report_tensor_allocations_upon_oom = req.options().report_tensor_allocations_upon_oom(); // Build the cost model every 'build_cost_model_every' steps after skipping an diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc index 95811476f7..b20e744a97 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc +++ 
b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc @@ -444,6 +444,24 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts, }); } +void GrpcWorker::LoggingAsync(const LoggingRequest* request, + LoggingResponse* response, StatusCallback done) { + auto env = this->env(); + if (env) { + auto session_mgr = (SessionMgr*)env->session_mgr; + if (session_mgr) { + session_mgr->SetLogging(request->rpc_logging()); + for (const auto& step_id : request->fetch_step_id()) { + session_mgr->RetrieveLogs(step_id, response); + } + if (request->clear()) { + session_mgr->ClearLogs(); + } + } + } + done(Status::OK()); +} + WorkerEnv* GrpcWorker::env() { return env_; } std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env) { diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h index 78a21fd9f6..fbddbda9e6 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h @@ -40,6 +40,9 @@ class GrpcWorker : public Worker { ::grpc::ByteBuffer* response, StatusCallback done); + virtual void LoggingAsync(const LoggingRequest* request, + LoggingResponse* response, StatusCallback done); + WorkerEnv* env(); private: diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc index 8db49e7f15..90664c3612 100644 --- a/tensorflow/core/distributed_runtime/session_mgr.cc +++ b/tensorflow/core/distributed_runtime/session_mgr.cc @@ -64,8 +64,13 @@ Status SessionMgr::CreateSession(const string& session, TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache)); } + if (worker_cache != nullptr && default_worker_cache_.get() != nullptr) { + worker_cache->SetLogging(this->is_logging_active_); + } + CHECK(!worker_env_->local_devices.empty()) << "The WorkerEnv must have at least one device in `local_devices`."; + std::vector<Device*> renamed_devices; for
(Device* d : worker_env_->local_devices) { renamed_devices.push_back(RenamedDevice::NewRenamedDevice( @@ -113,4 +118,77 @@ std::shared_ptr<WorkerSession> SessionMgr::LegacySession() { return legacy_session_; } +void SessionMgr::SetLogging(bool active) { + mutex_lock l(mu_); + this->is_logging_active_ = active; + // Legacy Session + if (legacy_session_) { + auto* worker_cache = legacy_session_->worker_cache.get(); + if (worker_cache) { + worker_cache->SetLogging(active); + } + } + + for (const auto& session_kv : sessions_) { + auto session = session_kv.second.get(); + if (session) { + auto* worker_cache = session->worker_cache.get(); + if (worker_cache) { + worker_cache->SetLogging(active); + } + } + } +} + +void SessionMgr::RetrieveLogs(tensorflow::int64 step_id, + LoggingResponse* response) { + mutex_lock l(mu_); + // Legacy Session + if (legacy_session_) { + auto* worker_cache = legacy_session_->worker_cache.get(); + if (worker_cache) { + auto step_stats = StepStats(); + if (worker_cache->RetrieveLogs(step_id, &step_stats)) { + auto* labeled_step_stats = response->add_step(); + labeled_step_stats->set_step_id(step_id); + labeled_step_stats->mutable_step_stats()->Swap(&step_stats); + } + } + } + for (const auto& session_kv : sessions_) { + auto session = session_kv.second.get(); + if (session) { + auto* worker_cache = session->worker_cache.get(); + if (worker_cache) { + auto step_stats = StepStats(); + if (worker_cache->RetrieveLogs(step_id, &step_stats)) { + auto* labeled_step_stats = response->add_step(); + labeled_step_stats->set_step_id(step_id); + labeled_step_stats->mutable_step_stats()->Swap(&step_stats); + } + } + } + } +} + +void SessionMgr::ClearLogs() { + mutex_lock l(mu_); + // Legacy Session + if (legacy_session_) { + auto* worker_cache = legacy_session_->worker_cache.get(); + if (worker_cache) { + worker_cache->ClearLogs(); + } + } + + for (const auto& session_kv : sessions_) { + auto session = session_kv.second.get(); + if (session) { + auto* 
worker_cache = session->worker_cache.get(); + if (worker_cache) { + worker_cache->ClearLogs(); + } + } + } +} } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h index 3ce260d12e..4c9702d522 100644 --- a/tensorflow/core/distributed_runtime/session_mgr.h +++ b/tensorflow/core/distributed_runtime/session_mgr.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" +#include "tensorflow/core/protobuf/worker.pb.h" namespace tensorflow { @@ -56,6 +57,12 @@ class SessionMgr { static string WorkerNameFromServerDef(const ServerDef& server_def); + void SetLogging(bool active); + + void RetrieveLogs(tensorflow::int64 step_id, LoggingResponse* response); + + void ClearLogs(); + private: const WorkerEnv* const worker_env_; // Not owned. @@ -75,6 +82,8 @@ class SessionMgr { std::unique_ptr<WorkerCacheInterface> default_worker_cache_; std::shared_ptr<WorkerSession> legacy_session_; + bool is_logging_active_ = false; + const WorkerCacheFactory worker_cache_factory_; std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked( diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h index e448a60f5e..e90596980f 100644 --- a/tensorflow/core/framework/register_types.h +++ b/tensorflow/core/framework/register_types.h @@ -53,7 +53,7 @@ limitations under the License. */ #if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) || \ - defined(NVIDIA_TEGRA) + defined(ANDROID_TEGRA) // All types are supported, so all macros are invoked. 
// diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc index 395329da3b..ee07db1aee 100644 --- a/tensorflow/core/framework/variant_op_registry.cc +++ b/tensorflow/core/framework/variant_op_registry.cc @@ -182,7 +182,7 @@ Status VariantDeviceCopy( // Special casing UnaryOpFn per op and per device. UnaryVariantOpRegistry::VariantUnaryOpFn* UnaryVariantOpRegistry::GetUnaryOpFn( VariantUnaryOp op, StringPiece device, StringPiece type_name) { - auto found = unary_op_fns.find(std::make_tuple(op, device, type_name)); + auto found = unary_op_fns.find({op, device, type_name}); if (found == unary_op_fns.end()) return nullptr; return &found->second; } @@ -195,12 +195,10 @@ void UnaryVariantOpRegistry::RegisterUnaryOpFn( CHECK_EQ(existing, nullptr) << "Unary VariantUnaryOpFn for type_name: " << type_name << " already registered for device type: " << device; - unary_op_fns.insert( - std::pair<std::tuple<VariantUnaryOp, StringPiece, StringPiece>, - VariantUnaryOpFn>( - std::make_tuple(op, GetPersistentStringPiece(device), - GetPersistentStringPiece(type_name)), - unary_op_fn)); + unary_op_fns.insert(std::pair<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn>( + {op, GetPersistentStringPiece(device), + GetPersistentStringPiece(type_name)}, + unary_op_fn)); } namespace { @@ -229,7 +227,7 @@ REGISTER_VARIANT_ZEROS_LIKE_TYPE(bool); UnaryVariantOpRegistry::VariantBinaryOpFn* UnaryVariantOpRegistry::GetBinaryOpFn(VariantBinaryOp op, StringPiece device, StringPiece type_name) { - auto found = binary_op_fns.find(std::make_tuple(op, device, type_name)); + auto found = binary_op_fns.find({op, device, type_name}); if (found == binary_op_fns.end()) return nullptr; return &found->second; } @@ -242,12 +240,10 @@ void UnaryVariantOpRegistry::RegisterBinaryOpFn( CHECK_EQ(existing, nullptr) << "Unary VariantBinaryOpFn for type_name: " << type_name << " already registered for device type: " << device; - binary_op_fns.insert( - 
std::pair<std::tuple<VariantBinaryOp, StringPiece, StringPiece>, - VariantBinaryOpFn>( - std::make_tuple(op, GetPersistentStringPiece(device), - GetPersistentStringPiece(type_name)), - add_fn)); + binary_op_fns.insert(std::pair<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn>( + {op, GetPersistentStringPiece(device), + GetPersistentStringPiece(type_name)}, + add_fn)); } namespace { diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h index 13f6908cae..e94100e994 100644 --- a/tensorflow/core/framework/variant_op_registry.h +++ b/tensorflow/core/framework/variant_op_registry.h @@ -166,6 +166,21 @@ class UnaryVariantOpRegistry { device_copy_fns; // Map std::tuple<Op, device, type_name> to function. + + // this breaks by falling victim to "too perfect forwarding" + // see https://stackoverflow.com/questions/44475317/variadic-template-issue + // and references therein + template <typename Op> + struct FuncTuple { + FuncTuple(const Op& op, const StringPiece& dev, const StringPiece& tname) + : op_type_(op), device_(dev), typename_(tname){}; + Op op_type_; + StringPiece device_, typename_; + }; + // friend declaration for operator== + // needed for clang + template <typename Op> + friend bool operator==(const FuncTuple<Op>& l, const FuncTuple<Op>& r); struct TupleHash { template <typename Op> std::size_t operator()( @@ -176,18 +191,25 @@ class UnaryVariantOpRegistry { ret = Hash64Combine(ret, sp_hasher_(std::get<2>(x))); return ret; } + + template <typename Op> + std::size_t operator()(const FuncTuple<Op>& x) const { + // The hash of an enum is just its value as a std::size_t. 
+ std::size_t ret = static_cast<std::size_t>(x.op_type_); + ret = Hash64Combine(ret, sp_hasher_(x.device_)); + ret = Hash64Combine(ret, sp_hasher_(x.typename_)); + return ret; + } StringPieceHasher sp_hasher_; }; - std::unordered_map<std::tuple<VariantUnaryOp, StringPiece, StringPiece>, - VariantUnaryOpFn, TupleHash> + std::unordered_map<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn, TupleHash> unary_op_fns; - std::unordered_map<std::tuple<VariantBinaryOp, StringPiece, StringPiece>, - VariantBinaryOpFn, TupleHash> + std::unordered_map<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn, TupleHash> binary_op_fns; // Find or insert a string into a persistent string storage - // container; return the StringPiece pointing to the permanent - // string location. + // container; return the StringPiece pointing to the permanent string + // location. static StringPiece GetPersistentStringPiece(const string& str) { const auto string_storage = PersistentStringStorage(); auto found = string_storage->find(str); @@ -199,7 +221,12 @@ class UnaryVariantOpRegistry { } } }; - +template <typename Op> +inline bool operator==(const UnaryVariantOpRegistry::FuncTuple<Op>& lhs, + const UnaryVariantOpRegistry::FuncTuple<Op>& rhs) { + return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) && + (lhs.typename_ == rhs.typename_); +} // Gets a TensorShape from a Tensor containing a scalar Variant. // Returns an Internal error if the Variant does not have a registered shape // function, or if it's a serialized Variant that cannot be decoded. diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 68c3136019..7d3be15299 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -42,7 +42,7 @@ limitations under the License. 
namespace tensorflow { -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // This pass implements rewriting of graph to support following scenarios: // (A) Merging nodes in the graph @@ -2211,7 +2211,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) { return Status::OK(); } -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML // This pass implements rewriting of graph to support following scenarios: // (A) Merging nodes in the graph @@ -2452,9 +2452,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // NOTE: names are alphabetically sorted. rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN, AddNRewrite}); - /* rinfo_.push_back({csinfo_.add, - mkl_op_registry::GetMklOpName(csinfo_.add), - CopyAttrsDataType, AlwaysRewrite}); */ + rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add), + CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.avg_pool, mkl_op_registry::GetMklOpName(csinfo_.avg_pool), CopyAttrsPooling, AlwaysRewrite}); @@ -2502,14 +2501,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.max_pool_grad, mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad), CopyAttrsPooling, AlwaysRewrite}); - /* + rinfo_.push_back({csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul), CopyAttrsDataType, AlwaysRewrite}); - */ rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu), CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.relu_grad, @@ -2529,14 +2527,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.softmax, mkl_op_registry::GetMklOpName(csinfo_.softmax), CopyAttrsDataType, AlwaysRewrite}); - /* + rinfo_.push_back({csinfo_.squared_difference, mkl_op_registry::GetMklOpName(csinfo_.squared_difference), CopyAttrsDataType, 
AlwaysRewrite}); rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub), CopyAttrsDataType, AlwaysRewrite}); - */ // Add info about which ops to add workspace edge to and the slots. wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3}); @@ -4317,7 +4314,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) { return Status::OK(); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow #endif diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 320d5a48c7..5e2a465e22 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -38,7 +38,7 @@ limitations under the License. namespace tensorflow { -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML namespace { @@ -1899,7 +1899,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); } // namespace -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML namespace { @@ -3532,7 +3532,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); } // namespace -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc index d5b026eae3..0d88d1ff72 100644 --- a/tensorflow/core/graph/testlib.cc +++ b/tensorflow/core/graph/testlib.cc @@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) { return Binary(g, "ReverseV2", tensor, axis); } +Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry()) + .Input(input) + .Input(shift) + .Input(axis) + .Finalize(g, &ret)); + return ret; +} + Node* Error(Graph* g, Node* input, const string& errmsg) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error") diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h index 06597778bb..eb9038d619 100644 --- a/tensorflow/core/graph/testlib.h +++ 
b/tensorflow/core/graph/testlib.h @@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha); // Output dtype determined by lam. Node* RandomPoisson(Graph* g, Node* shape, Node* lam); +// Rolls tensor by an offset of <shift> along the corresponding +// <axis> dimensions. +Node* Roll(Graph* g, Node* input, Node* shift, Node* axis); + // Generates random parameters from the truncated standard normal distribution // of the nput shape Node* TruncatedNormal(Graph* g, Node* input, DataType dtype); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fd99409c9b..e7192ec42f 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -629,6 +629,7 @@ cc_library( ":transpose_op", ":unique_op", ":unpack_op", + ":unravel_index_op", ":where_op", ], ) @@ -884,6 +885,12 @@ tf_kernel_library( ) tf_kernel_library( + name = "unravel_index_op", + prefix = "unravel_index_op", + deps = ARRAY_DEPS, +) + +tf_kernel_library( name = "where_op", srcs = ["where_op.cc"], hdrs = ["where_op.h"], @@ -2582,6 +2589,45 @@ tf_cc_tests( ], ) +cc_library( + name = "manip", + deps = [ + ":roll_op", + ], +) + +MANIP_DEPS = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:manip_ops_op_lib", + "//third_party/eigen3", +] + +tf_kernel_library( + name = "roll_op", + prefix = "roll_op", + deps = MANIP_DEPS, +) + +tf_cc_test( + name = "roll_op_test", + size = "small", + srcs = ["roll_op_test.cc"], + deps = [ + ":ops_testutil", + ":ops_util", + ":roll_op", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + MATH_DEPS = [ ":bounds_check", ":fill_functor", diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc index 9f626a274a..224fe534e3 
100644 --- a/tensorflow/core/kernels/compare_and_bitpack_op.cc +++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc @@ -110,7 +110,19 @@ struct ComputeShard<T, typename TTypes<bool>::ConstMatrix input, typename TTypes<uint8>::Matrix output, bool /*thresh*/, int64 start, int64 limit) { - // NOTE(ebrevdo): This assumes memory is little-endian. +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (int64 i = start; i < limit; ++i) { + uint8* out = output.data() + i; + const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i); + *out = ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) | + (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) | + (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) | + (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) | + (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) | + (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) | + (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL))))); + } +#else for (int64 i = start; i < limit; ++i) { uint8* out = output.data() + i; const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i); @@ -123,6 +135,7 @@ struct ComputeShard<T, (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) | (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7))); } +#endif } }; diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc index c778278e8f..b7d120a617 100644 --- a/tensorflow/core/kernels/decode_bmp_op.cc +++ b/tensorflow/core/kernels/decode_bmp_op.cc @@ -39,6 +39,13 @@ class DecodeBmpOp : public OpKernel { errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ", channels_)); } + inline int32 ByteSwapInt32ForBigEndian(int32 x) { +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + return le32toh(x); +#else + return x; +#endif + } void Compute(OpKernelContext* context) override { const Tensor& contents = context->input(0); @@ -56,14 +63,18 @@ class DecodeBmpOp : public OpKernel { input.size(), " bytes")); const uint8* img_bytes = reinterpret_cast<const 
uint8*>(input.data()); - const int32 header_size = internal::SubtleMustCopy( + int32 header_size_ = internal::SubtleMustCopy( *(reinterpret_cast<const int32*>(img_bytes + 10))); - const int32 width = internal::SubtleMustCopy( + const int32 header_size = ByteSwapInt32ForBigEndian(header_size_); + int32 width_ = internal::SubtleMustCopy( *(reinterpret_cast<const int32*>(img_bytes + 18))); - const int32 height = internal::SubtleMustCopy( + const int32 width = ByteSwapInt32ForBigEndian(width_); + int32 height_ = internal::SubtleMustCopy( *(reinterpret_cast<const int32*>(img_bytes + 22))); - const int32 bpp = internal::SubtleMustCopy( + const int32 height = ByteSwapInt32ForBigEndian(height_); + int32 bpp_ = internal::SubtleMustCopy( *(reinterpret_cast<const int32*>(img_bytes + 28))); + const int32 bpp = ByteSwapInt32ForBigEndian(bpp_); if (channels_) { OP_REQUIRES(context, (channels_ == bpp / 8), diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h index df0bbbfa06..2d7a230fc0 100644 --- a/tensorflow/core/kernels/fractional_pool_common.h +++ b/tensorflow/core/kernels/fractional_pool_common.h @@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) { // * sum(generated_diff_pooling_sequence) = input_length // * Let's define floor(input_length / output_length) = K, then // K <= generated_diff_pooling_sequence[i] <= K+1 -// For example, when input_length = 10, output_length = 6, the followings are +// For example, when input_length = 10, output_length = 6, the following are // valid pooling sequence: // * [1, 2, 2, 1, 2, 2] // * [1, 1, 2, 2, 2, 2] diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index 89d37d2f87..b539b00009 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -28,7 +28,7 @@ limitations under the License. 
#include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; using mkldnn::sum; @@ -37,7 +37,7 @@ using mkldnn::sum; namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, typename T> class MklAddNOp : public OpKernel { @@ -285,7 +285,7 @@ class MklAddNOp : public OpKernel { } MklAddNOpContext; }; -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML template <typename Device, typename T> class MklAddNOp : public OpKernel { public: @@ -317,8 +317,11 @@ class MklAddNOp : public OpKernel { : src2_tensor.dims(); // if the shapes of two tensors are not same raise op error TensorShape src1_shape, src2_shape; - src1_shape = src1_tensor.shape(); - src2_shape = src2_tensor.shape(); + src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape() + : src1_tensor.shape(); + src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape() + : src2_tensor.shape(); + if (!src1_shape.IsSameSize(src2_shape)) { ctx->SetStatus(errors::InvalidArgument( "Inputs to operation ", this->name(), " of type ", diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index a7c569ee05..d545d34fdf 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -24,7 +24,7 @@ #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::algorithm; using mkldnn::engine; @@ -40,8 +40,7 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. 
-#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, typename T> class MklAvgPoolingOp : public OpKernel { @@ -429,7 +428,7 @@ class MklAvgPoolingGradOp : public OpKernel { TensorFormat data_format_; }; // MklAvgPoolingGradOp -#else // INTEL_MKL_DNN is defined +#else template <typename Device, typename T> class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> { @@ -466,6 +465,28 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> { memory::dims output_dims_mkl_order; this->GetOutputDims(pool_params, &output_dims_mkl_order); + // If input is an empty tensor, allocate an empty output tensor and return + if (input_tensor.NumElements() == 0) { + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(false); + TensorShape output_tf_shape; + if (pool_params.data_format == TensorFormat::FORMAT_NCHW) { + output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order); + } else { + memory::dims output_dims_NHWC_order; + output_dims_NHWC_order = {pool_params.tensor_in_batch, + static_cast<int>(pool_params.out_height), + static_cast<int>(pool_params.out_width), + pool_params.out_depth}; + output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order); + } + const int kOutputIndex = 0; + AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor, + output_tf_shape, output_mkl_shape); + CHECK_NOTNULL(output_tensor); + return; + } + // If input is in Mkl layout, then just get the memory format from it // directly, instead of using input data_format to AvgPool. 
if (dnn_shape_input.IsMklTensor()) { @@ -678,7 +699,7 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> { } }; // MklAvgPoolingGradOp -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML REGISTER_KERNEL_BUILDER(Name("_MklAvgPool") .Device(DEVICE_CPU) diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 7da63604d2..f1f267e849 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -30,7 +30,7 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::concat; @@ -62,7 +62,7 @@ class EigenConcatBaseOp : public OpKernel { // we need to have empty Compute because Compute is pure virtual function. void Compute(OpKernelContext* c) {} -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Compute(OpKernelContext* c, const std::vector<Tensor>& values) { const Tensor* concat_dim_tensor; @@ -230,7 +230,7 @@ class EigenConcatBaseOp : public OpKernel { #endif }; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // -------------------------------------------------------------------------- // Mkl Concat Op diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index ef3f8cfec1..1401bc65a4 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -42,7 +42,7 @@ limitations under the License. 
#include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::convolution_backward_weights; @@ -55,7 +55,7 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, class T> class MklConv2DCustomBackpropFilterOp : public OpKernel { @@ -655,7 +655,7 @@ class MklConv2DCustomBackpropFilterOp TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); #undef REGISTER_MKL_FILTER_KERNELS -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index a6745489f4..eeed009531 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -44,7 +44,7 @@ limitations under the License. #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::convolution_backward_data; @@ -56,7 +56,7 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, class T> class MklConv2DCustomBackpropInputOp : public OpKernel { @@ -493,7 +493,7 @@ class MklConv2DCustomBackpropInputOp } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #define REGISTER_MKL_CPU_KERNELS(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput") \ diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index e44fba754b..2953426d58 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -41,7 +41,8 @@ limitations under the License. 
#include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML + #include "mkldnn.hpp" using mkldnn::prop_kind; @@ -58,8 +59,8 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +// MKL-DNN is now default. MKL-ML must be specified explicitly. +#ifdef INTEL_MKL_ML template <typename Device, typename T, bool biasEnabled> class MklConv2DOp : public OpKernel { diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index 8b65eaea0d..9dd88221a8 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -40,7 +40,7 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::prop_kind; @@ -52,7 +52,7 @@ using mkldnn::convolution_forward; namespace tensorflow { -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML class MklDnnConvUtil { protected: @@ -553,7 +553,7 @@ class MklConv2DBackpropCommonOp : public OpKernel { Padding padding_; TensorFormat data_format_; }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML ///////////////////////////////////////////////////////////////////// /// Dummy Mkl op that is just used for operators that are intermediate diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc index c065724e0d..58f0c30f32 100644 --- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc +++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0(the "License"); you may not use this file except in compliance with the License. 
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 0b6d838e09..8313224d7f 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -25,7 +25,7 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::batch_normalization_backward; @@ -41,7 +41,7 @@ using mkldnn::use_scale_shift; namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, typename T> class MklFusedBatchNormOp : public OpKernel { @@ -683,7 +683,7 @@ class MklFusedBatchNormGradOp : public OpKernel { }; #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template <typename Device, typename T> class MklFusedBatchNormOp : public OpKernel { diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc index 9ee27ee21c..6c027f8e72 100644 --- a/tensorflow/core/kernels/mkl_identity_op.cc +++ b/tensorflow/core/kernels/mkl_identity_op.cc @@ -28,14 +28,14 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, typename T> class MklIdentityOp : public OpKernel { diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc index 73d41efce1..5a8799ae93 100644 --- a/tensorflow/core/kernels/mkl_input_conversion_op.cc +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -31,7 +31,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/mkl_tfconv_op.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; @@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; // convert the TF format input to MKL format /////////////////////////////////////////////////////////// -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, typename T> class MklInputConversionOp : public OpKernel { public: @@ -293,14 +293,58 @@ class MklInputConversionOp : public OpKernel { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // If both inputs are in MKL format if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) { - // If both have the same shape, pass them through if (tf_shapes_are_same) { - VLOG(1) << "MklInputConversionOp: No conversion needed, " - << "copying MKL inputs with identical shapes to output"; - - ForwardMklTensorInToOut(context, 0, 0); - ForwardMklTensorInToOut(context, 1, 1); - return; + auto input0_md = input_shape_0.GetMklLayout(); + auto input1_md = input_shape_1.GetMklLayout(); + + // If both have the same shape and same format, pass them through + if (input0_md.data.format == input1_md.data.format) { + VLOG(1) << "MklInputConversionOp: No conversion needed, " + << "copying MKL inputs with identical shapes to output"; + + ForwardMklTensorInToOut(context, 0, 0); + ForwardMklTensorInToOut(context, 1, 1); + return; + } else { + VLOG(1) << "MklInputConversionOp: Shape is same, but format is " + "different, " + << "need to convert to same format"; + + // Convert input0, and keep input1 unchanged + // Create MklDnnShape for output mkl tensor based on input0 + Tensor* tensor_out; + MklDnnShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(true); + mkl_output_mkl_shape.SetElemType(MklDnnType<T>()); + mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(), + input_shape_0.GetSizesAsMklDnnDims(), + input_shape_0.GetTfDataFormat()); + + 
// Get MKL layout from input1 as destination layout + mkl_output_mkl_shape.SetMklLayout(&input1_md); + + // Create output Mkl tensor for index 0 + AllocateOutputSetMklShape(context, 0, &tensor_out, + input_tensor_0.shape(), + mkl_output_mkl_shape); + + // Create MklDnnData object for input0 tesnsor + auto cpu_engine = engine(engine::cpu, 0); + MklDnnData<T> input(&cpu_engine); + input.SetUsrMem(input0_md, &input_tensor_0); + + // Create reorder from input0's layout to input1's layout + std::vector<primitive> net; + CHECK_EQ(input.CheckReorderToOpMem( + memory::primitive_desc(input1_md, cpu_engine), + tensor_out, &net), + true); + stream(stream::kind::eager).submit(net).wait(); + + // Input1 will be passed through + ForwardMklTensorInToOut(context, 1, 1); + return; + } } // Sanity check diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index a8b45004b7..5f0a12a1fb 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -38,7 +38,7 @@ limitations under the License. #include "tensorflow/core/util/work_sharder.h" #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::lrn_across_channels; using mkldnn::lrn_backward; @@ -67,7 +67,7 @@ void GetBandMatrix(int depth, int depth_radius, } // namespace -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename T> class MklLRNOp : public OpKernel { @@ -1343,7 +1343,7 @@ class MklLRNGradOp : public OpKernel { float beta_; }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #define REGISTER_MKL_LRN_CPU(T) \ REGISTER_KERNEL_BUILDER(Name("_MklLRN") \ diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index 0de27ccd60..14607f26e0 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include <algorithm> #include "mkldnn.hpp" using mkldnn::algorithm; @@ -39,8 +39,8 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +// MKL-DNN is now default. MKL-ML must be specified explicitly. +#ifdef INTEL_MKL_ML // An implementation of MaxPooling (forward). template <typename Device, typename T> @@ -494,7 +494,7 @@ class MklMaxPoolingGradOp : public OpKernel { bool workspace_enabled_; }; // MklMaxPoolingGradOp -#else // INTEL_MKL_DNN is defined +#else // An implementation of MaxPooling (forward). template <typename Device, typename T> @@ -793,7 +793,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> { } }; // MklMaxPoolingGradOp -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML REGISTER_KERNEL_BUILDER(Name("_MklMaxPool") .Device(DEVICE_CPU) diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index ef8597b057..5ef6ce2a57 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -42,7 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // Initialization for MKL format void MklPoolParameters::Init(OpKernelContext* context, const std::vector<int32>& ksize, @@ -72,7 +72,7 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML // Common Initialization for TensorFlow and MKL formats void MklPoolParameters::Init(OpKernelContext* context, const std::vector<int32>& ksize, @@ -107,7 +107,7 @@ void MklPoolParameters::Init(OpKernelContext* context, OP_REQUIRES_OK(context, 
GetWindowedOutputSizeVerbose( tensor_in_cols, window_cols, col_stride, padding, &out_width, &pad_left, &pad_right)); -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // TF can work with int64, but mkldnn only supports int32 // Fail if the height or width are greater than MAX_INT diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index 880e45ab1e..279167aba2 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::memory; using mkldnn::pooling_backward; @@ -85,7 +85,7 @@ struct MklPoolParameters { void Init(OpKernelContext* context, const std::vector<int32>& ksize, const std::vector<int32>& stride, Padding padding, TensorFormat data_format, const TensorShape& tensor_in_shape); -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Init(OpKernelContext* context, const std::vector<int32>& ksize, const std::vector<int32>& stride, Padding padding, TensorFormat data_format, const MklShape* mkl_in_shape); @@ -102,7 +102,7 @@ struct MklPoolParameters { TensorFormat data_format); }; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template <class T> class MklPoolingOpBase : public OpKernel { @@ -395,7 +395,7 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> { return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md; } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML //------------------------------------------------------------------- // Utility functions diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 873aca30ca..51db3991e2 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -28,7 +28,7 @@ limitations under the License. 
#include "tensorflow/core/platform/default/logging.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::algorithm; @@ -58,7 +58,7 @@ struct MklReluHelpers { } }; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename Device, typename T> class MklReluOp : public OpKernel { @@ -368,7 +368,7 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) { mkl_context.MklCleanup(); } -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML template <typename Device, typename T, algorithm alg_kind> class MklReluOpBase : public OpKernel { @@ -849,7 +849,7 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> { MklReluGradOp<CPUDevice, type>); TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES); -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // register dnn kernels for supported operations and supported types #define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \ diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc index 7d471e1e4c..5dbc4a2709 100644 --- a/tensorflow/core/kernels/mkl_reshape_op.cc +++ b/tensorflow/core/kernels/mkl_reshape_op.cc @@ -28,7 +28,7 @@ limitations under the License. 
#include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; #endif @@ -40,7 +40,7 @@ class MklReshapeOp : public OpKernel { public: explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {} -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Compute(OpKernelContext* context) override { const Tensor& input = MklGetInput(context, 0); const Tensor& sizes = MklGetInput(context, 1); @@ -312,7 +312,7 @@ class MklReshapeOp : public OpKernel { } } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML private: const int kInputSlotIdx = 0; diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc index c46eabdde1..aceef1e234 100644 --- a/tensorflow/core/kernels/mkl_softmax_op.cc +++ b/tensorflow/core/kernels/mkl_softmax_op.cc @@ -15,7 +15,7 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #ifdef INTEL_MKL -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" @@ -156,5 +156,5 @@ TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES); } // namespace tensorflow -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h index c4d5a45d3c..5fafa14b5d 100644 --- a/tensorflow/core/kernels/mkl_tfconv_op.h +++ b/tensorflow/core/kernels/mkl_tfconv_op.h @@ -35,7 +35,7 @@ limitations under the License. 
#include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML using mkldnn::stream; #endif @@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel { VLOG(1) << "MKLToTFConversion complete successfully."; } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context, string data_format_str, DataType op_data_type, bool has_avx512f, uint input_number) { diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc new file mode 100644 index 0000000000..bcbdbee058 --- /dev/null +++ b/tensorflow/core/kernels/roll_op.cc @@ -0,0 +1,334 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/register_types_traits.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +#define EIGEN_USE_THREADS +using CPUDevice = Eigen::ThreadPoolDevice; + +// dim_size - the size of each dimension +// dim_range - the number of indices over in the flattened tensor +// you need to skip in order to make it over from one side of a dimension +// to the other. Used to make the shifts wrap around after a threshold. +// threshold - the index for each dimension that the roll starts to wrap +// back to the front +template <typename T> +void DoRoll(OpKernelContext* context, const int64 num_elements, + const int num_dims, const gtl::ArraySlice<int>& dim_size, + const T* input, T* output, const gtl::ArraySlice<int>& threshold, + const gtl::ArraySlice<int64>& dim_range) { + auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range]( + int64 start, int64 end) { + // array of indices for each dimension + gtl::InlinedVector<int, 4> indices(num_dims); + int offset = 0; // the shift along the flattened tensor for current element + // initialize indices and offset + for (int i = 0; i < num_dims; i++) { + // stride is the number of indices over in the flattened tensor + // you need to skip in order to make it over to an adjacent element + // along a dimension. 
dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indices[i] = indx;
+      // calculate dimension index after the shift
+      const int shifted_indx = (indx + shift) % dim_size[i];
+      offset += (shifted_indx - indx) * stride;
+    }
+
+    for (int64 i = start; i < end; i++) {
+      output[i + offset] = input[i];
+      // create next combination of indices
+      // while at it adjust offset if needed
+      for (int j = num_dims - 1; j >= 0; j--) {
+        const int indx = (indices[j] + 1) % dim_size[j];
+        indices[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {  // we've reached the threshold
+            // dim_range[j] = threshold[j] + shift[j]
+            // offset = shift[j] + ... other offsets
+            // offset - dim_range[j] = -threshold[j] + ... other offsets
+            // thus we undo our previous offset as well as add a new offset of
+            // -threshold[j] in one operation
+            offset -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          offset += dim_range[j];  // indx became 0 so reverse wrap around
+        }
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  // 15 - experimentally determined with float and bool types
+  const int cost_per_element = 15 * sizeof(T);  // rough estimate
+  Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
+        cost_per_element, std::move(work));
+}
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+// you need to skip in order to make it over from one side of a dimension
+// to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+// back to the front
+// isd - inner shift dimension
+template <typename T>
+// Use memcpy to copy memory in groups when the data type supports memcpy
+void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements,
+                      const int num_dims, const gtl::ArraySlice<int>& dim_size,
+                      const T* input, T* output,
+                      const gtl::ArraySlice<int>& threshold,
+                      const gtl::ArraySlice<int64>& dim_range,
+                      const int64 isd) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd](
+                  int64 start, int64 end) {
+    // the number of indices over in the flattened tensor you need to skip in
+    // order to make it over from one side of the isd to the other
+    const int64 isd_range = std::max<int>(dim_range[isd], 1);
+    // the distance along the flattened tensor to the next element in the isd
+    const int64 isd_stride = isd_range / std::max<int>(dim_size[isd], 1);
+
+    // start and end represent the i-th group currently so we will convert
+    // them into numbers representing the i-th elements.
+    // there are 2 groups per isd one for all elements before threshold[isd]
+    // and another for all elements after threshold[isd].
+ const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride; + const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride; + start = (start / 2) * isd_range + start_remainder; + end = (end / 2) * isd_range + end_remainder; + + const T* in_ptr = &input[0]; + T* out_ptr = &output[0]; + in_ptr += start; + out_ptr += start; + + // array of indices for each dimension + // indicies = [i, j, k, l, m, n] + gtl::InlinedVector<int, 4> indicies(num_dims); + // the offset needed to make all inner non-shifting dimensions become 0 + int64 remainder_offset = 0; + // initialize indicies + for (int i = 0; i < num_dims; i++) { + // stride is the number of indices over in the flattened tensor + // you need to skip in order to make it over to an adjacent element + // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1) + const int64 stride = dim_range[i] / dim_size[i]; + const int shift = dim_size[i] - threshold[i]; + const int indx = (start / stride) % dim_size[i]; + indicies[i] = indx; + // calculate dimension index after the shift + int out_indx = (indx + shift) % dim_size[i]; + if (i > isd) { + // trailing zeroes for indices after the inner shifted dimension + out_indx = 0; + remainder_offset += (out_indx - indx) * stride; + } + out_ptr += (out_indx - indx) * stride; + } + // set trailing zeroes for indices after the inner shifted dimension + for (int i = num_dims - 1; i > isd; i--) indicies[i] = 0; + + // the number of indices in the isd dimension the next group will skip + // to make it to the next threshold or end point + int isd_indx_skip = 0; + // the size of the next group + int64 group_size = 0; + // initialize isd_indx_skip and group_size + if (indicies[isd] < threshold[isd]) { + isd_indx_skip = threshold[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride + remainder_offset; + } else { + isd_indx_skip = dim_size[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride + remainder_offset; + } + + int64 i = start; + 
while (i < end) {
+      // copy group of elements
+      memcpy(out_ptr, in_ptr, group_size * sizeof(T));
+
+      // shift i and the pointers over to the next group position
+      i += group_size;
+      out_ptr += group_size;
+      in_ptr += group_size;
+
+      // produce next combination of indices and adjust the out_ptr position
+      // to fix the offset if necessary
+      // the isd (inner shift dim) should skip to next threshold or endpoint
+      // all dimensions to the left increment by 1 when a digit is carried
+      // all dimensions to the right remain set to 0
+      //              +1 +1 +1 +isd_indx_skip
+      // indicies = [i, j, k, l, 0, 0]
+      //              ^isd
+      for (int j = isd; j >= 0; j--) {
+        int inc = 1;
+        if (j == isd) inc = isd_indx_skip;
+        const int indx = (indicies[j] + inc) % dim_size[j];
+        indicies[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {
+            out_ptr -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          out_ptr += dim_range[j];  // indx became 0 so reverse wrap around
+        }
+      }
+
+      // set isd_indx_skip and group_size for next iteration
+      if (indicies[isd] < threshold[isd]) {
+        isd_indx_skip = threshold[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      } else {
+        isd_indx_skip = dim_size[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  const int64 ave_group_size = dim_range[isd] / 2;
+  const int total_work = 2 * num_elements / std::max<int>(dim_range[isd], 1);
+  // 25000 - experimentally determined with float and bool types
+  const int cost_per_group = 25000 * sizeof(T) * ave_group_size;
+  Shard(worker_threads->num_threads, worker_threads->workers, total_work,
+        cost_per_group, std::move(work));
+}
+
+template <typename Device, typename T, typename Tshift, typename Taxis>
+class RollOp : public OpKernel {
+ public:
+  explicit RollOp(OpKernelConstruction* context) : 
OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& input = context->input(0); + const Tensor& shift = context->input(1); + const Tensor& axis = context->input(2); + + auto shift_flat = shift.flat<Tshift>(); + auto axis_flat = axis.flat<Taxis>(); + + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()), + errors::InvalidArgument("input must be 1-D or higher")); + OP_REQUIRES(context, shift.shape().dims() <= 1, + errors::InvalidArgument( + "shift must be a scalar or a 1-D vector. Found: ", + shift.shape().DebugString())); + OP_REQUIRES(context, axis.shape().dims() <= 1, + errors::InvalidArgument( + "axis must be a scalar or a 1-D vector. Found: ", + axis.shape().DebugString())); + OP_REQUIRES( + context, shift.shape() == axis.shape(), + errors::InvalidArgument("shift and axis must have the same size")); + const int64 num_elements = input.NumElements(); + const int num_shifts = static_cast<int>(shift_flat.size()); + const int num_dims = input.dims(); + + // if there are any duplicate axes, shift_mod_sum will have the + // total modulo sum of shifts for each dimension + gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0); + for (int i = 0; i < num_shifts; i++) { + const int axis = axis_flat(i); + OP_REQUIRES(context, axis < num_dims, + errors::InvalidArgument("axis ", axis, " is out of range")); + const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1); + const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i)); + // modulo that works with negatives: ((x % y) + y) % y + shift_mod_sum[axis] = (sum % ds + ds) % ds; + } + // the size of each dimension + gtl::InlinedVector<int, 4> dim_size(num_dims); + // threshold[i] is the index that the roll starts to wrap back to the front + gtl::InlinedVector<int, 4> threshold(num_dims); + // dim_range is the number of indices over in the flattened tensor + // you need to skip in order to make it over from one side of a 
dimension
+    // to the other. Used to make the shifts wrap around after a threshold.
+    gtl::InlinedVector<int64, 4> dim_range(num_dims);
+    int64 dim_size_prod = 1;  // dimension size product
+    // inner shift dimension (inner most shifted dimension)
+    int64 isd = 0;
+    for (int i = num_dims - 1; i >= 0; i--) {
+      if (isd == 0 && shift_mod_sum[i] != 0) isd = i;
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(i)), 1);
+      dim_size[i] = ds;
+      threshold[i] = (ds - shift_mod_sum[i]) % ds;
+      dim_size_prod *= static_cast<int64>(input.dim_size(i));
+      dim_range[i] = dim_size_prod;
+    }
+
+    Tensor* output = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    auto input_flat = input.flat<T>().data();
+    auto output_flat = output->flat<T>().data();
+
+    if (std::is_same<Device, CPUDevice>::value) {
+      if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+        // V2 copies memory in groups instead of element by element
+        DoRollWithMemcpy<T>(context, num_elements, num_dims, dim_size,
+                            input_flat, output_flat, threshold, dim_range, isd);
+      } else {
+        // in case memcpy does not work for current data type
+        DoRoll<T>(context, num_elements, num_dims, dim_size, input_flat,
+                  output_flat, threshold, dim_range);
+      }
+    }
+  }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(type) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int32>("Tshift") \ + .TypeConstraint<int32>("Taxis"), \ + RollOp<CPUDevice, type, int32, int32>) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int64>("Tshift") \ + .TypeConstraint<int32>("Taxis"), \ + RollOp<CPUDevice, type, int64, int32>) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int32>("Tshift") \ + .TypeConstraint<int64>("Taxis"), \ + RollOp<CPUDevice, type, int32, int64>) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<int64>("Tshift") \ + .TypeConstraint<int64>("Taxis"), \ + RollOp<CPUDevice, type, int64, int64>) + +TF_CALL_ALL_TYPES(REGISTER_CPU); +#undef REGISTER_CPU +} // namespace tensorflow diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc new file mode 100644 index 0000000000..90b6f8d0f3 --- /dev/null +++ b/tensorflow/core/kernels/roll_op_test.cc @@ -0,0 +1,484 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include <functional> +#include <memory> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace { + +class RollOpTest : public OpsTestBase { + protected: + void MakeOp(DataType data_type, DataType index_type) { + TF_ASSERT_OK(NodeDefBuilder("myop", "Roll") + .Input(FakeInput(data_type)) + .Input(FakeInput(index_type)) + .Input(FakeInput(index_type)) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + } +}; + +TEST_F(RollOpTest, ScalarIndices) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({}), {3}); + AddInputFromArray<int32>(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5})); + test::FillValues<float>(&expected, {2, 3, 4, 0, 1}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ScalarIndices_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray<string>(TensorShape({5}), {"a", "b", "c", "d", "e"}); + AddInputFromArray<int32>(TensorShape({}), {3}); + AddInputFromArray<int32>(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5})); + test::FillValues<string>(&expected, {"c", "d", "e", "a", "b"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ScalarIndices_Complex) { + MakeOp(DT_COMPLEX64, DT_INT32); + + // Feed and run + AddInputFromArray<std::complex<float>>( + TensorShape({5}), {std::complex<float>(0, 10), std::complex<float>(1, 11), + std::complex<float>(2, 12), std::complex<float>(3, 13), + std::complex<float>(4, 14)}); + AddInputFromArray<int32>(TensorShape({}), {3}); + AddInputFromArray<int32>(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5})); + test::FillValues<std::complex<float>>( + &expected, {std::complex<float>(2, 12), std::complex<float>(3, 13), + std::complex<float>(4, 14), std::complex<float>(0, 10), + std::complex<float>(1, 11)}); + test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({3, 5}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray<int32>(TensorShape({2}), {2, -1}); + AddInputFromArray<int32>(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5})); + test::FillValues<float>(&expected, + {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray<string>(TensorShape({3, 5}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray<int32>(TensorShape({2}), {2, -1}); + AddInputFromArray<int32>(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({3, 5})); + test::FillValues<string>(&expected, {"g", "h", "i", "j", "f", "l", "m", "n", + "o", "k", "b", "c", "d", "e", "a"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({2, 2, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1}); + AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3})); + test::FillValues<float>(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray<string>( + TensorShape({2, 2, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1}); + AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3})); + test::FillValues<string>( + &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD64) { + MakeOp(DT_FLOAT, DT_INT64); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray<int64>(TensorShape({2}), {-1, 4}); + AddInputFromArray<int64>(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3})); + test::FillValues<float>(&expected, + {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) { + MakeOp(DT_STRING, DT_INT64); + + // Feed and run + AddInputFromArray<string>(TensorShape({5, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray<int64>(TensorShape({2}), {-1, 4}); + AddInputFromArray<int64>(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5, 3})); + test::FillValues<string>(&expected, {"f", "d", "e", "i", "g", "h", "l", "j", + "k", "o", "m", "n", "c", "a", "b"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD64) { + MakeOp(DT_FLOAT, DT_INT64); + + // Feed and run + AddInputFromArray<float>(TensorShape({4, 1, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2}); + AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3})); + test::FillValues<float>(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) { + MakeOp(DT_STRING, DT_INT64); + + // Feed and run + AddInputFromArray<string>( + TensorShape({4, 1, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2}); + AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3})); + test::FillValues<string>( + &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroShift_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({2, 2, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0}); + AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3})); + test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray<string>( + TensorShape({2, 2, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0}); + AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3})); + test::FillValues<string>( + &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroSize_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 0, 0}), {}); + AddInputFromArray<int32>(TensorShape({}), {1}); + AddInputFromArray<int32>(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0})); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray<string>(TensorShape({5, 0, 0}), {}); + AddInputFromArray<int32>(TensorShape({}), {1}); + AddInputFromArray<int32>(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0})); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, OneSize_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({1, 1, 1}), {5}); + AddInputFromArray<int32>(TensorShape({}), {1}); + AddInputFromArray<int32>(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1})); + test::FillValues<float>(&expected, {5}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray<string>(TensorShape({1, 1, 1}), {"a"}); + AddInputFromArray<int32>(TensorShape({}), {1}); + AddInputFromArray<int32>(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
+ Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1})); + test::FillValues<string>(&expected, {"a"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, MultiShifts_TwoD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({3, 5}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1}); + AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5})); + test::FillValues<float>(&expected, + {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray<string>(TensorShape({3, 5}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1}); + AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
+ Tensor expected(allocator(), DT_STRING, TensorShape({3, 5})); + test::FillValues<string>(&expected, {"l", "m", "n", "o", "k", "b", "c", "d", + "e", "a", "g", "h", "i", "j", "f"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({}), {7}); + AddInputFromArray<int32>(TensorShape({}), {1}); + AddInputFromArray<int32>(TensorShape({}), {0}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher")) + << s; +} + +TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({}), {1}); + AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("axis must be a scalar or a 1-D vector")) + << s; +} + +TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1}); + AddInputFromArray<int32>(TensorShape({}), {1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("shift must be a scalar or a 1-D vector")) + << s; +} + +TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({1}), {1}); + AddInputFromArray<int32>(TensorShape({2}), {0, 1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("shift and axis must have the same size")) + << s; +} + +TEST_F(RollOpTest, Error_AxisOutOfRange) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({4}), {1, 2, 
3, 4}); + AddInputFromArray<int32>(TensorShape({}), {1}); + AddInputFromArray<int32>(TensorShape({}), {1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s; +} + +// isd - (inner shift dimension) The inner most dimension to be shifted. +// All outer dimensions will also be shifted for testing. +static Graph* RollGraph(const TensorShape& shape, int isd) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor input(DT_FLOAT, shape); + input.flat<float>().setRandom(); + const int dims = static_cast<int>(input.dims()); + Tensor shift(DT_INT32, TensorShape({dims})); + for (int i = 0; i < dims; i++) { + // shift the inner shift dimension and all outer dimensions + shift.flat<int32>()(i) = (i <= isd) ? 2 : 0; + } + Tensor axis(DT_INT32, TensorShape({dims})); + for (int i = 0; i < dims; i++) { + axis.flat<int32>()(i) = i; + } + test::graph::Roll(g, test::graph::Constant(g, input), + test::graph::Constant(g, shift), + test::graph::Constant(g, axis)); + return g; +} + +#define BM_ROLL_OUTER(DEVICE) \ + static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) { \ + TensorShape shape{rows, columns}; \ + const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \ + testing::ItemsProcessed(num_items); \ + testing::BytesProcessed(num_items * sizeof(float)); \ + testing::UseRealTime(); \ + test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_roll_outer) \ + ->ArgPair(256, 256) \ + ->ArgPair(512, 512) \ + ->ArgPair(1024, 1024) \ + ->ArgPair(2048, 2048) + +#define BM_ROLL_ALL(DEVICE) \ + static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) { \ + TensorShape shape{rows, columns}; \ + const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \ + testing::ItemsProcessed(num_items); \ + testing::BytesProcessed(num_items * sizeof(float)); \ + testing::UseRealTime(); \ + test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters); \ + } \ 
+ BENCHMARK(BM_##DEVICE##_roll_all) \ + ->ArgPair(256, 256) \ + ->ArgPair(512, 512) \ + ->ArgPair(1024, 1024) \ + ->ArgPair(2048, 2048) + +BM_ROLL_OUTER(cpu); +BM_ROLL_ALL(cpu); +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc new file mode 100644 index 0000000000..a61272675b --- /dev/null +++ b/tensorflow/core/kernels/unravel_index_op.cc @@ -0,0 +1,122 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +namespace { +template <typename T> +struct mod_op { + const T operator()(const T& a, const T& b) const { return a % b; } +}; +} // namespace + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename Tidx> +class UnravelIndexOp : public OpKernel { + public: + explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& indices_tensor = ctx->input(0); + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(indices_tensor.shape()) || + TensorShapeUtils::IsScalar(indices_tensor.shape()), + errors::InvalidArgument( + "The indices can only be scalar or vector, got \"", + indices_tensor.shape().DebugString(), "\"")); + + const Tensor& dims_tensor = ctx->input(1); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(dims_tensor.shape()), + errors::InvalidArgument("The dims can only be 1-D, got \"", + dims_tensor.shape().DebugString(), "\"")); + + auto dims = dims_tensor.vec<Tidx>(); + + Eigen::array<bool, 1> reverse({true}); + + Tensor strides_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<Tidx>::value, + TensorShape({dims_tensor.NumElements()}), + &strides_tensor)); + + auto strides = strides_tensor.vec<Tidx>(); + strides = dims.reverse(reverse) + .scan(0, Eigen::internal::ProdReducer<Tidx>(), false) + .reverse(reverse); + + Tensor strides_shifted_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<Tidx>::value, + TensorShape({dims_tensor.NumElements()}), + &strides_shifted_tensor)); + + auto strides_shifted = strides_shifted_tensor.vec<Tidx>(); + strides_shifted = 
dims.reverse(reverse) + .scan(0, Eigen::internal::ProdReducer<Tidx>(), true) + .reverse(reverse); + + Tensor* output_tensor = nullptr; + if (TensorShapeUtils::IsScalar(indices_tensor.shape())) { + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}), + &output_tensor)); + + auto output = output_tensor->vec<Tidx>(); + + output = output.constant(indices_tensor.scalar<Tidx>()()); + output = output.binaryExpr(strides, mod_op<Tidx>()) / strides_shifted; + } else { + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, + TensorShape({dims_tensor.NumElements(), + indices_tensor.NumElements()}), + &output_tensor)); + + auto output = output_tensor->matrix<Tidx>(); + + Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}}; + Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()}); + Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}}; + Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1}); + + output = indices_tensor.vec<Tidx>() + .reshape(indices_reshape) + .broadcast(indices_bcast); + output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast), + mod_op<Tidx>()) / + strides_shifted.reshape(reshape).broadcast(bcast); + } + } +}; + +#define REGISTER_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint<type>("Tidx"), \ + UnravelIndexOp<type>); +TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL) +#undef REGISTER_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc index 8b8c1392a1..09336e79cd 100644 --- a/tensorflow/core/lib/io/random_inputstream.cc +++ b/tensorflow/core/lib/io/random_inputstream.cc @@ -57,6 +57,43 @@ Status RandomAccessInputStream::ReadNBytes(int64 bytes_to_read, return Status::OK(); } +// To limit memory usage, the default implementation of SkipNBytes() only reads +// 8MB at a time. 
+static constexpr int64 kMaxSkipSize = 8 * 1024 * 1024; + +Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) { + if (bytes_to_skip < 0) { + return errors::InvalidArgument("Can't skip a negative number of bytes"); + } + std::unique_ptr<char[]> scratch(new char[kMaxSkipSize]); + // Try to read 1 byte first; if we could complete the read then EOF is + // not reached yet and we could return. + if (bytes_to_skip > 0) { + StringPiece data; + Status s = file_->Read(pos_ + bytes_to_skip - 1, 1, &data, scratch.get()); + if ((s.ok() || errors::IsOutOfRange(s)) && data.size() == 1) { + pos_ += bytes_to_skip; + return Status::OK(); + } + } + // Read kMaxSkipSize at a time till bytes_to_skip. + while (bytes_to_skip > 0) { + int64 bytes_to_read = std::min<int64>(kMaxSkipSize, bytes_to_skip); + StringPiece data; + Status s = file_->Read(pos_, bytes_to_read, &data, scratch.get()); + if (s.ok() || errors::IsOutOfRange(s)) { + pos_ += data.size(); + } else { + return s; + } + if (data.size() < bytes_to_read) { + return errors::OutOfRange("reached end of file"); + } + bytes_to_skip -= bytes_to_read; + } + return Status::OK(); +} + int64 RandomAccessInputStream::Tell() const { return pos_; } } // namespace io diff --git a/tensorflow/core/lib/io/random_inputstream.h b/tensorflow/core/lib/io/random_inputstream.h index 09ebe9ba49..bdbdbd71ff 100644 --- a/tensorflow/core/lib/io/random_inputstream.h +++ b/tensorflow/core/lib/io/random_inputstream.h @@ -34,6 +34,8 @@ class RandomAccessInputStream : public InputStreamInterface { Status ReadNBytes(int64 bytes_to_read, string* result) override; + Status SkipNBytes(int64 bytes_to_skip) override; + int64 Tell() const override; Status Seek(int64 position) { diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 87dfa77689..267ce88440 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -335,6 +335,13 @@ REGISTER_OP("Unpack") return Status::OK(); }); 
+REGISTER_OP("UnravelIndex") + .Input("indices: Tidx") + .Input("dims: Tidx") + .Output("output: Tidx") + .Attr("Tidx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { return Status::OK(); }); + // -------------------------------------------------------------------------- // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph // in the N == 1 case to remove the node. diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index ef2ac267cc..a62e2d782b 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -586,6 +586,17 @@ REGISTER_OP("NonMaxSuppression") .Output("selected_indices: int32") .Attr("iou_threshold: float = 0.5") .SetShapeFn([](InferenceContext* c) { + // Get inputs and validate ranks. + ShapeHandle boxes; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes)); + ShapeHandle scores; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores)); + ShapeHandle max_output_size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size)); + // The boxes is a 2-D float Tensor of shape [num_boxes, 4]. + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused)); + c->set_output(0, c->Vector(c->UnknownDim())); return Status::OK(); }); @@ -597,6 +608,19 @@ REGISTER_OP("NonMaxSuppressionV2") .Input("iou_threshold: float") .Output("selected_indices: int32") .SetShapeFn([](InferenceContext* c) { + // Get inputs and validate ranks. + ShapeHandle boxes; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes)); + ShapeHandle scores; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores)); + ShapeHandle max_output_size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size)); + ShapeHandle iou_threshold; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold)); + // The boxes is a 2-D float Tensor of shape [num_boxes, 4]. 
+ DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused)); + c->set_output(0, c->Vector(c->UnknownDim())); return Status::OK(); }); diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc new file mode 100644 index 0000000000..95b4774fe6 --- /dev/null +++ b/tensorflow/core/ops/manip_ops.cc @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +// -------------------------------------------------------------------------- +REGISTER_OP("Roll") + .Input("input: T") + .Input("shift: Tshift") + .Input("axis: Taxis") + .Output("output: T") + .Attr("T: type") + .Attr("Tshift: {int32,int64}") + .Attr("Taxis: {int32,int64}") + .SetShapeFn(shape_inference::UnchangedShape); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 62661fe4bd..67481fd202 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1818,7 +1818,7 @@ REGISTER_OP("_MklMaxPool") .Input("input: T") .Input("mkl_input: uint8") .Output("output: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Output("workspace: T") #else .Output("workspace: uint8") @@ -1844,7 
+1844,7 @@ REGISTER_OP("_MklMaxPoolGrad") .Input("orig_input: T") .Input("orig_output: T") .Input("grad: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Input("workspace: T") #else .Input("workspace: uint8") @@ -1916,7 +1916,7 @@ REGISTER_OP("_MklLRN") .Input("input: T") .Input("mkl_input: uint8") .Output("output: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Output("workspace: T") #else .Output("workspace: uint8") @@ -1944,7 +1944,7 @@ REGISTER_OP("_MklLRNGrad") .Input("input_grads: T") .Input("input_image: T") .Input("output_image: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Input("workspace: T") #else .Input("workspace: uint8") diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc index b0d7b3a67a..b570658158 100644 --- a/tensorflow/core/platform/cpu_feature_guard.cc +++ b/tensorflow/core/platform/cpu_feature_guard.cc @@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag; void InfoAboutUnusedCPUFeatures() { std::call_once(g_cpu_feature_guard_warn_once_flag, [] { string missing_instructions; -#ifdef PLATFORM_WINDOWS +#if defined(_MSC_VER) && !defined(__clang__) + #ifndef __AVX__ CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions); #endif // __AVX__ #ifndef __AVX2__ CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions); #endif // __AVX2__ -#else // ifdef platform windows + +#else // if defined(_MSC_VER) && !defined(__clang__) + #ifndef __SSE__ CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions); #endif // __SSE__ @@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() { #ifndef __FMA__ CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions); #endif // __FMA__ -#endif // else of ifdef platform windows +#endif // else of if defined(_MSC_VER) && !defined(__clang__) if (!missing_instructions.empty()) { LOG(INFO) << "Your CPU supports instructions that this TensorFlow " << "binary was not compiled to use:" << missing_instructions; diff 
--git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h index 2da20bb1b8..7b580c8bf6 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.h +++ b/tensorflow/core/platform/profile_utils/cpu_utils.h @@ -42,7 +42,7 @@ namespace profile_utils { class CpuUtils { public: // Constant for invalid frequency. - // This value is returned when the furequency is not obtained somehow. + // This value is returned when the frequency is not obtained somehow. static constexpr int64 INVALID_FREQUENCY = -1; static constexpr uint64 DUMMY_CYCLE_CLOCK = 1; @@ -105,7 +105,7 @@ class CpuUtils { static int64 GetCycleCounterFrequency(); #endif - // Return micro secound per each clock + // Return micro second per each clock // As this method caches the cpu frequency internally, // the first call will incur overhead, but not subsequent calls. static double GetMicroSecPerClock(); diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index ebda3a2065..52bf0d4694 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include <aws/core/Aws.h> #include <aws/core/config/AWSProfileConfigLoader.h> #include <aws/core/utils/FileSystemUtils.h> +#include <aws/core/utils/StringUtils.h> #include <aws/core/utils/logging/AWSLogging.h> #include <aws/core/utils/logging/LogSystemInterface.h> #include <aws/s3/S3Client.h> @@ -128,6 +129,15 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() { return cfg; }; +void ShutdownClient(Aws::S3::S3Client* s3_client) { + if (s3_client != nullptr) { + delete s3_client; + Aws::SDKOptions options; + Aws::ShutdownAPI(options); + AWSLogSystem::ShutdownAWSLogging(); + } +} + Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket, string* object) { if (!bucket || !object) { @@ -155,12 +165,12 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket, class S3RandomAccessFile : public RandomAccessFile { public: - S3RandomAccessFile(const string& bucket, const string& object) - : bucket_(bucket), object_(object) {} + S3RandomAccessFile(const string& bucket, const string& object, + std::shared_ptr<Aws::S3::S3Client> s3_client) + : bucket_(bucket), object_(object), s3_client_(s3_client) {} Status Read(uint64 offset, size_t n, StringPiece* result, char* scratch) const override { - Aws::S3::S3Client s3Client(GetDefaultClientConfig()); Aws::S3::Model::GetObjectRequest getObjectRequest; getObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str()); string bytes = strings::StrCat("bytes=", offset, "-", offset + n - 1); @@ -168,7 +178,7 @@ class S3RandomAccessFile : public RandomAccessFile { getObjectRequest.SetResponseStreamFactory([]() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); }); - auto getObjectOutcome = s3Client.GetObject(getObjectRequest); + auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest); if (!getObjectOutcome.IsSuccess()) { n = 0; *result = StringPiece(scratch, n); @@ -186,13 +196,16 @@ class S3RandomAccessFile : public RandomAccessFile { private: string 
bucket_; string object_; + std::shared_ptr<Aws::S3::S3Client> s3_client_; }; class S3WritableFile : public WritableFile { public: - S3WritableFile(const string& bucket, const string& object) + S3WritableFile(const string& bucket, const string& object, + std::shared_ptr<Aws::S3::S3Client> s3_client) : bucket_(bucket), object_(object), + s3_client_(s3_client), sync_needed_(true), outfile_(Aws::MakeShared<Aws::Utils::TempFile>( kS3FileSystemAllocationTag, "/tmp/s3_filesystem_XXXXXX", @@ -231,17 +244,13 @@ class S3WritableFile : public WritableFile { if (!sync_needed_) { return Status::OK(); } - Aws::Client::ClientConfiguration clientConfig = GetDefaultClientConfig(); - clientConfig.connectTimeoutMs = 300000; - clientConfig.requestTimeoutMs = 600000; - Aws::S3::S3Client s3Client(clientConfig); Aws::S3::Model::PutObjectRequest putObjectRequest; putObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str()); long offset = outfile_->tellp(); outfile_->seekg(0); putObjectRequest.SetBody(outfile_); putObjectRequest.SetContentLength(offset); - auto putObjectOutcome = s3Client.PutObject(putObjectRequest); + auto putObjectOutcome = this->s3_client_->PutObject(putObjectRequest); outfile_->clear(); outfile_->seekp(offset); if (!putObjectOutcome.IsSuccess()) { @@ -256,6 +265,7 @@ class S3WritableFile : public WritableFile { private: string bucket_; string object_; + std::shared_ptr<Aws::S3::S3Client> s3_client_; bool sync_needed_; std::shared_ptr<Aws::Utils::TempFile> outfile_; }; @@ -274,31 +284,46 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { } // namespace -S3FileSystem::S3FileSystem() { - AWSLogSystem::InitializeAWSLogging(); - - Aws::SDKOptions options; - options.cryptoOptions.sha256Factory_create_fn = []() { - return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag); - }; - options.cryptoOptions.sha256HMACFactory_create_fn = []() { - return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag); - }; - Aws::InitAPI(options); -} - 
-S3FileSystem::~S3FileSystem() { - Aws::SDKOptions options; - Aws::ShutdownAPI(options); +S3FileSystem::S3FileSystem() + : s3_client_(nullptr, ShutdownClient), client_lock_() {} + +S3FileSystem::~S3FileSystem() {} + +// Initializes s3_client_, if needed, and returns it. +std::shared_ptr<Aws::S3::S3Client> S3FileSystem::GetS3Client() { + std::lock_guard<mutex> lock(this->client_lock_); + + if (this->s3_client_.get() == nullptr) { + AWSLogSystem::InitializeAWSLogging(); + + Aws::SDKOptions options; + options.cryptoOptions.sha256Factory_create_fn = []() { + return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag); + }; + options.cryptoOptions.sha256HMACFactory_create_fn = []() { + return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag); + }; + Aws::InitAPI(options); + + // The creation of S3Client disables virtual addressing: + // S3Client(clientConfiguration, signPayloads, useVirtualAddressing = true) + // The purpose is to address the issue encountered when there is an `.` + // in the bucket name. Due to TLS hostname validation or DNS rules, + // the bucket may not be resolved. Disabling of virtual addressing + // should address the issue. See GitHub issue 16397 for details. 
+ this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(new Aws::S3::S3Client( + GetDefaultClientConfig(), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false)); + } - AWSLogSystem::ShutdownAWSLogging(); + return this->s3_client_; } Status S3FileSystem::NewRandomAccessFile( const string& fname, std::unique_ptr<RandomAccessFile>* result) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3RandomAccessFile(bucket, object)); + result->reset(new S3RandomAccessFile(bucket, object, this->GetS3Client())); return Status::OK(); } @@ -306,7 +331,7 @@ Status S3FileSystem::NewWritableFile(const string& fname, std::unique_ptr<WritableFile>* result) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3WritableFile(bucket, object)); + result->reset(new S3WritableFile(bucket, object, this->GetS3Client())); return Status::OK(); } @@ -321,7 +346,7 @@ Status S3FileSystem::NewAppendableFile(const string& fname, string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3WritableFile(bucket, object)); + result->reset(new S3WritableFile(bucket, object, this->GetS3Client())); while (true) { status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk, @@ -372,7 +397,6 @@ Status S3FileSystem::GetChildren(const string& dir, prefix.push_back('/'); } - Aws::S3::S3Client s3Client(GetDefaultClientConfig()); Aws::S3::Model::ListObjectsRequest listObjectsRequest; listObjectsRequest.WithBucket(bucket.c_str()) .WithPrefix(prefix.c_str()) @@ -383,7 +407,8 @@ Status S3FileSystem::GetChildren(const string& dir, Aws::S3::Model::ListObjectsResult listObjectsResult; do { - auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest); + auto listObjectsOutcome = + this->GetS3Client()->ListObjects(listObjectsRequest); if (!listObjectsOutcome.IsSuccess()) { string error = strings::StrCat( 
listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ", @@ -417,11 +442,10 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, true, &bucket, &object)); - Aws::S3::S3Client s3Client(GetDefaultClientConfig()); if (object.empty()) { Aws::S3::Model::HeadBucketRequest headBucketRequest; headBucketRequest.WithBucket(bucket.c_str()); - auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest); + auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest); if (!headBucketOutcome.IsSuccess()) { string error = strings::StrCat( headBucketOutcome.GetError().GetExceptionName().c_str(), ": ", @@ -439,7 +463,7 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) { headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str()); headObjectRequest.SetResponseStreamFactory( []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); }); - auto headObjectOutcome = s3Client.HeadObject(headObjectRequest); + auto headObjectOutcome = this->GetS3Client()->HeadObject(headObjectRequest); if (headObjectOutcome.IsSuccess()) { stats->length = headObjectOutcome.GetResult().GetContentLength(); stats->is_directory = 0; @@ -457,7 +481,8 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) { .WithMaxKeys(1); listObjectsRequest.SetResponseStreamFactory( []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); }); - auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest); + auto listObjectsOutcome = + this->GetS3Client()->ListObjects(listObjectsRequest); if (listObjectsOutcome.IsSuccess()) { if (listObjectsOutcome.GetResult().GetContents().size() > 0) { stats->length = 0; @@ -475,11 +500,11 @@ Status S3FileSystem::DeleteFile(const string& fname) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - Aws::S3::S3Client s3Client(GetDefaultClientConfig()); 
Aws::S3::Model::DeleteObjectRequest deleteObjectRequest; deleteObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str()); - auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest); + auto deleteObjectOutcome = + this->GetS3Client()->DeleteObject(deleteObjectRequest); if (!deleteObjectOutcome.IsSuccess()) { string error = strings::StrCat( deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ", @@ -494,10 +519,9 @@ Status S3FileSystem::CreateDir(const string& dirname) { TF_RETURN_IF_ERROR(ParseS3Path(dirname, true, &bucket, &object)); if (object.empty()) { - Aws::S3::S3Client s3Client(GetDefaultClientConfig()); Aws::S3::Model::HeadBucketRequest headBucketRequest; headBucketRequest.WithBucket(bucket.c_str()); - auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest); + auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest); if (!headBucketOutcome.IsSuccess()) { return errors::NotFound("The bucket ", bucket, " was not found."); } @@ -517,7 +541,6 @@ Status S3FileSystem::DeleteDir(const string& dirname) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(dirname, false, &bucket, &object)); - Aws::S3::S3Client s3Client(GetDefaultClientConfig()); string prefix = object; if (prefix.back() != '/') { prefix.push_back('/'); @@ -528,7 +551,8 @@ Status S3FileSystem::DeleteDir(const string& dirname) { .WithMaxKeys(2); listObjectsRequest.SetResponseStreamFactory( []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); }); - auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest); + auto listObjectsOutcome = + this->GetS3Client()->ListObjects(listObjectsRequest); if (listObjectsOutcome.IsSuccess()) { auto contents = listObjectsOutcome.GetResult().GetContents(); if (contents.size() > 1 || @@ -568,8 +592,6 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) { } } - Aws::S3::S3Client s3Client(GetDefaultClientConfig()); - Aws::S3::Model::CopyObjectRequest copyObjectRequest; 
Aws::S3::Model::DeleteObjectRequest deleteObjectRequest; @@ -582,7 +604,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) { Aws::S3::Model::ListObjectsResult listObjectsResult; do { - auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest); + auto listObjectsOutcome = + this->GetS3Client()->ListObjects(listObjectsRequest); if (!listObjectsOutcome.IsSuccess()) { string error = strings::StrCat( listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ", @@ -595,13 +618,15 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) { Aws::String src_key = object.GetKey(); Aws::String target_key = src_key; target_key.replace(0, src_object.length(), target_object.c_str()); - Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key; + Aws::String source = Aws::String(src_bucket.c_str()) + "/" + + Aws::Utils::StringUtils::URLEncode(src_key.c_str()); copyObjectRequest.SetBucket(target_bucket.c_str()); copyObjectRequest.SetKey(target_key); copyObjectRequest.SetCopySource(source); - auto copyObjectOutcome = s3Client.CopyObject(copyObjectRequest); + auto copyObjectOutcome = + this->GetS3Client()->CopyObject(copyObjectRequest); if (!copyObjectOutcome.IsSuccess()) { string error = strings::StrCat( copyObjectOutcome.GetError().GetExceptionName().c_str(), ": ", @@ -612,7 +637,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) { deleteObjectRequest.SetBucket(src_bucket.c_str()); deleteObjectRequest.SetKey(src_key.c_str()); - auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest); + auto deleteObjectOutcome = + this->GetS3Client()->DeleteObject(deleteObjectRequest); if (!deleteObjectOutcome.IsSuccess()) { string error = strings::StrCat( deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ", diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h index 31ba3cecc5..31264be621 100644 --- 
a/tensorflow/core/platform/s3/s3_file_system.h +++ b/tensorflow/core/platform/s3/s3_file_system.h @@ -16,7 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_ #define TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_ +#include <aws/s3/S3Client.h> #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { @@ -53,6 +55,26 @@ class S3FileSystem : public FileSystem { Status GetFileSize(const string& fname, uint64* size) override; Status RenameFile(const string& src, const string& target) override; + + private: + // Returns the member S3 client, initializing as-needed. + // When the client tries to access the object in S3, e.g., + // s3://bucket-name/path/to/object + // the behavior could be controlled by various environmental + // variables. + // By default S3 access regional endpoint, with region + // controlled by `AWS_REGION`. The endpoint could be overridden + // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default. + // If S3_USE_HTTPS=0 is specified, HTTP is used. Also, + // S3_VERIFY_SSL=0 could disable SSL verification in case + // HTTPS is used. + // This S3 Client does not support Virtual Hosted–Style Method + // for a bucket. + std::shared_ptr<Aws::S3::S3Client> GetS3Client(); + + std::shared_ptr<Aws::S3::S3Client> s3_client_; + // Lock held when checking for s3_client_ initialization. + mutex client_lock_; }; } // namespace tensorflow diff --git a/tensorflow/core/platform/s3/s3_file_system_test.cc b/tensorflow/core/platform/s3/s3_file_system_test.cc index 0b42f5fcec..d4411d9865 100644 --- a/tensorflow/core/platform/s3/s3_file_system_test.cc +++ b/tensorflow/core/platform/s3/s3_file_system_test.cc @@ -130,6 +130,8 @@ TEST_F(S3FileSystemTest, NewReadOnlyMemoryRegionFromFile) { TEST_F(S3FileSystemTest, FileExists) { const string fname = TmpDir("FileExists"); + // Ensure the file doesn't yet exist. 
+ TF_ASSERT_OK(s3fs.DeleteFile(fname)); EXPECT_EQ(error::Code::NOT_FOUND, s3fs.FileExists(fname).code()); TF_ASSERT_OK(WriteString(fname, "test")); TF_EXPECT_OK(s3fs.FileExists(fname)); diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h index d6e78dbc8f..f20939d3c0 100644 --- a/tensorflow/core/platform/windows/cpu_info.h +++ b/tensorflow/core/platform/windows/cpu_info.h @@ -22,8 +22,10 @@ limitations under the License. // Byte order defines provided by gcc. MSVC doesn't define those so // we define them here. // We assume that all windows platform out there are little endian. +#if defined(_MSC_VER) && !defined(__clang__) #define __ORDER_LITTLE_ENDIAN__ 0x4d2 #define __ORDER_BIG_ENDIAN__ 0x10e1 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#endif #endif // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index 460f935e4a..57d76eb4cb 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file. # can also generate memory profile using `-select bytes` tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename> -trim_name_regexes .*apply_op.* -# Use pprof to visualize the generated file. -pprof -png --nodecount=100 --sample_index=1 <filename> +# Use google-pprof, from the google-perftools package, to visualize the generated file. +# On Ubuntu you can install it with `apt-get install google-perftools`. 
+google-pprof --pdf --nodecount=100 <filename> ``` ![PprofGraph](g3doc/pprof.jpg) diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h index 0790cb0ca6..db148c936c 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.h +++ b/tensorflow/core/profiler/internal/tfprof_stats.h @@ -83,7 +83,7 @@ class TFStats { const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd, const Options& opts) const; - // A a (partial) graph to existing graph. + // Add a (partial) graph to existing graph. void AddGraph(std::unique_ptr<GraphDef> graph); // Add a step of run time meta data. @@ -118,7 +118,7 @@ class TFStats { MultiGraphNodeProto empty_multi_graph_node_; std::map<int64, string> id_to_string_; - // Graph nodes covered by RunMetdata, that is traced with run time stats. + // Graph nodes covered by RunMetadata, that is traced with run time stats. std::set<int64> covered_nodes_; }; diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc index 2cc212d589..808e3c853b 100644 --- a/tensorflow/core/profiler/profiler.cc +++ b/tensorflow/core/profiler/profiler.cc @@ -206,8 +206,12 @@ int Run(int argc, char** argv) { "graph_path,op_log_path,run_meta_path\n"); std::unique_ptr<GraphDef> graph(new GraphDef()); if (!FLAGS_graph_path.empty()) { - TF_CHECK_OK( - ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false)); + s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false); + if (!s.ok()) { + fprintf(stderr, "Failed to read graph_path: %s\n", + s.ToString().c_str()); + return 1; + } } std::unique_ptr<OpLogProto> op_log(new OpLogProto()); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 67da7bf452..b02f899b87 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -24,7 +24,7 @@ limitations under the License. // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. 
"-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "-rc1" +#define TF_VERSION_SUFFIX "" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 864e7e39c2..db4c5c35e3 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -35,7 +35,7 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::engine; @@ -325,7 +325,7 @@ class MklShape { nullptr; // TF dimension corresponding to this MKL dimension }; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Forward decl TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format); @@ -660,7 +660,7 @@ class MklDnnShape { typedef std::vector<MklShape> MklShapeList; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML typedef std::vector<MklDnnShape> MklDnnShapeList; #endif @@ -674,7 +674,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) { return true; } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template <typename T> inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklShape& mkl_shape) { @@ -725,7 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) { mklshape->DeSerializeMklDnnShape( ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) @@ -749,7 +749,7 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name, ctext->input_list(name, input_tensors); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, MklShapeList* mkl_shapes) { @@ -779,7 +779,7 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, #endif -#ifdef 
INTEL_MKL_DNN +#ifndef INTEL_MKL_ML /// Get shape of input tensor pointed by 'input_idx' in TensorShape format. /// If the input tensor is in MKL layout, then obtains TensorShape from /// MklShape. @@ -814,7 +814,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat<uint8>().size() * sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Allocate the second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -851,7 +851,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat<uint8>().size() * sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Allocate the output tensor, create a second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -875,7 +875,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, // Allocates a temp tensor and returns the data buffer for temporary storage. 
// Currently -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template <typename T> inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, const memory::primitive_desc& pd, void** buf_out) { @@ -994,7 +994,7 @@ inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in, context->set_output(idx_meta_out, meta_output); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, int idx_out, const TensorShape& shape) { @@ -1032,7 +1032,7 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, } #endif -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { @@ -1090,7 +1090,7 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in, } } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context, int idx_in, int idx_out, const MklDnnShape& mkl_shape) { @@ -1132,7 +1132,7 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context, AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // We don't need these functions in MKLDNN. We have defined equality operator // on MklDnnShape class directly. 
@@ -1242,7 +1242,7 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) { // ------------------------------------------------------------------- -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML /// Return MKL-DNN data type (memory::data_type) for input type T /// @@ -1753,7 +1753,7 @@ class MklDnnData { } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc index 8b73eadb40..cd1d0713ad 100644 --- a/tensorflow/core/util/mkl_util_test.cc +++ b/tensorflow/core/util/mkl_util_test.cc @@ -22,7 +22,7 @@ limitations under the License. namespace tensorflow { namespace { -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML TEST(MklUtilTest, MklDnnTfShape) { auto cpu_engine = engine(engine::cpu, 0); @@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) { EXPECT_EQ(b_md2.data.format, mkldnn_blocked); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace } // namespace tensorflow |